25 files changed, 1148 insertions, 395 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3bfc8f1..30b426e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -412,4 +412,18 @@ config DM_VERITY
 
 	  If unsure, say N.
 
+config DM_SWITCH
+	tristate "Switch target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM
+	---help---
+	  This device-mapper target creates a device that supports an arbitrary
+	  mapping of fixed-size regions of I/O across a fixed set of paths.
+	  The path used for any specific region can be switched dynamically
+	  by sending the target a message.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-switch.
+
+	  If unsure, say N.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1439fd4..5ef78ef 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_DM_FLAKEY)		+= dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
+obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)	+= persistent-data/
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 05c220d..f950c9d 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,7 +1,6 @@
 
 config BCACHE
 	tristate "Block device as cache"
-	select CLOSURES
 	---help---
 	Allows a block device to be used as cache for other devices; uses
 	a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 340146d..d3e15b4 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1241,7 +1241,7 @@ void bch_cache_set_stop(struct cache_set *);
 struct cache_set *bch_cache_set_alloc(struct cache_sb *);
 void bch_btree_cache_free(struct cache_set *);
 int bch_btree_cache_alloc(struct cache_set *);
-void bch_writeback_init_cached_dev(struct cached_dev *);
+void bch_cached_dev_writeback_init(struct cached_dev *);
 void bch_moving_init_cache_set(struct cache_set *);
 
 void bch_cache_allocator_exit(struct cache *ca);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index cb4578a..1d27d3a 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -395,7 +395,7 @@ void inorder_test(void)
 #endif
 
 /*
- * Cacheline/offset <-> bkey pointer arithmatic:
+ * Cacheline/offset <-> bkey pointer arithmetic:
  *
  * t->tree is a binary search tree in an array; each node corresponds to a key
  * in one cacheline in t->set (BSET_CACHELINE bytes).
@@ -404,7 +404,7 @@ void inorder_test(void)
  * the binary tree points to; to_inorder() gives us the cacheline, and then
  * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
  *
- * cacheline_to_bkey() and friends abstract out all the pointer arithmatic to
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
  * make this work.
  *
  * To construct the bfloat for an arbitrary key we need to know what the key
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 64e6794..b8730e7 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -93,24 +93,6 @@ static struct attribute *bch_stats_files[] = {
 };
 static KTYPE(bch_stats);
 
-static void scale_accounting(unsigned long data);
-
-void bch_cache_accounting_init(struct cache_accounting *acc,
-			       struct closure *parent)
-{
-	kobject_init(&acc->total.kobj,		&bch_stats_ktype);
-	kobject_init(&acc->five_minute.kobj,	&bch_stats_ktype);
-	kobject_init(&acc->hour.kobj,		&bch_stats_ktype);
-	kobject_init(&acc->day.kobj,		&bch_stats_ktype);
-
-	closure_init(&acc->cl, parent);
-	init_timer(&acc->timer);
-	acc->timer.expires	= jiffies + accounting_delay;
-	acc->timer.data		= (unsigned long) acc;
-	acc->timer.function	= scale_accounting;
-	add_timer(&acc->timer);
-}
-
 int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
 				   struct kobject *parent)
 {
@@ -244,3 +226,19 @@ void bch_mark_sectors_bypassed(struct search *s, int sectors)
 	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
 	atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed);
 }
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+			       struct closure *parent)
+{
+	kobject_init(&acc->total.kobj,		&bch_stats_ktype);
+	kobject_init(&acc->five_minute.kobj,	&bch_stats_ktype);
+	kobject_init(&acc->hour.kobj,		&bch_stats_ktype);
+	kobject_init(&acc->day.kobj,		&bch_stats_ktype);
+
+	closure_init(&acc->cl, parent);
+	init_timer(&acc->timer);
+	acc->timer.expires	= jiffies + accounting_delay;
+	acc->timer.data		= (unsigned long) acc;
+	acc->timer.function	= scale_accounting;
+	add_timer(&acc->timer);
+}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c8046bc..f88e2b6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -634,11 +634,10 @@ static int open_dev(struct block_device *b, fmode_t mode)
 	return 0;
 }
 
-static int release_dev(struct gendisk *b, fmode_t mode)
+static void release_dev(struct gendisk *b, fmode_t mode)
 {
 	struct bcache_device *d = b->private_data;
 	closure_put(&d->cl);
-	return 0;
 }
 
 static int ioctl_dev(struct block_device *b, fmode_t mode,
@@ -732,8 +731,7 @@ static void bcache_device_free(struct bcache_device *d)
 
 	if (d->c)
 		bcache_device_detach(d);
-
-	if (d->disk)
+	if (d->disk && d->disk->flags & GENHD_FL_UP)
 		del_gendisk(d->disk);
 	if (d->disk && d->disk->queue)
 		blk_cleanup_queue(d->disk->queue);
@@ -756,12 +754,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
 	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
 	    !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
 				sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
-	    bio_split_pool_init(&d->bio_split_hook))
-
-		return -ENOMEM;
-
-	d->disk = alloc_disk(1);
-	if (!d->disk)
+	    bio_split_pool_init(&d->bio_split_hook) ||
+	    !(d->disk = alloc_disk(1)) ||
+	    !(q = blk_alloc_queue(GFP_KERNEL)))
 		return -ENOMEM;
 
 	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
@@ -771,10 +766,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
 	d->disk->fops		= &bcache_ops;
 	d->disk->private_data	= d;
 
-	q = blk_alloc_queue(GFP_KERNEL);
-	if (!q)
-		return -ENOMEM;
-
 	blk_queue_make_request(q, NULL);
 	d->disk->queue			= q;
 	q->queuedata			= d;
@@ -999,14 +990,17 @@ static void cached_dev_free(struct closure *cl)
 
 	mutex_lock(&bch_register_lock);
 
-	bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
+	if (atomic_read(&dc->running))
+		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
 	bcache_device_free(&dc->disk);
 	list_del(&dc->list);
 
 	mutex_unlock(&bch_register_lock);
 
 	if (!IS_ERR_OR_NULL(dc->bdev)) {
-		blk_sync_queue(bdev_get_queue(dc->bdev));
+		if (dc->bdev->bd_disk)
+			blk_sync_queue(bdev_get_queue(dc->bdev));
+
 		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 	}
 
@@ -1028,73 +1022,67 @@ static void cached_dev_flush(struct closure *cl)
 
 static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
 {
-	int err;
+	int ret;
 	struct io *io;
-
-	closure_init(&dc->disk.cl, NULL);
-	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+	struct request_queue *q = bdev_get_queue(dc->bdev);
 
 	__module_get(THIS_MODULE);
 	INIT_LIST_HEAD(&dc->list);
+	closure_init(&dc->disk.cl, NULL);
+	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
 	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
-
-	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
-
-	err = bcache_device_init(&dc->disk, block_size);
-	if (err)
-		goto err;
-
-	spin_lock_init(&dc->io_lock);
-	closure_init_unlocked(&dc->sb_write);
 	INIT_WORK(&dc->detach, cached_dev_detach_finish);
+	closure_init_unlocked(&dc->sb_write);
+	INIT_LIST_HEAD(&dc->io_lru);
+	spin_lock_init(&dc->io_lock);
+	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
 
 	dc->sequential_merge		= true;
 	dc->sequential_cutoff		= 4 << 20;
 
-	INIT_LIST_HEAD(&dc->io_lru);
-	dc->sb_bio.bi_max_vecs	= 1;
-	dc->sb_bio.bi_io_vec	= dc->sb_bio.bi_inline_vecs;
-
 	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
 		list_add(&io->lru, &dc->io_lru);
 		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
 	}
 
-	bch_writeback_init_cached_dev(dc);
+	ret = bcache_device_init(&dc->disk, block_size);
+	if (ret)
+		return ret;
+
+	set_capacity(dc->disk.disk,
+		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+
+	dc->disk.disk->queue->backing_dev_info.ra_pages =
+		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
+		    q->backing_dev_info.ra_pages);
+
+	bch_cached_dev_request_init(dc);
+	bch_cached_dev_writeback_init(dc);
 	return 0;
-err:
-	bcache_device_stop(&dc->disk);
-	return err;
 }
 
 /* Cached device - bcache superblock */
 
-static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
+static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 				 struct block_device *bdev,
 				 struct cached_dev *dc)
 {
 	char name[BDEVNAME_SIZE];
 	const char *err = "cannot allocate memory";
-	struct gendisk *g;
 	struct cache_set *c;
 
-	if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0)
-		return err;
-
 	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
-	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
 	dc->bdev = bdev;
 	dc->bdev->bd_holder = dc;
 
-	g = dc->disk.disk;
-
-	set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
-
-	g->queue->backing_dev_info.ra_pages =
-		max(g->queue->backing_dev_info.ra_pages,
-		    bdev->bd_queue->backing_dev_info.ra_pages);
+	bio_init(&dc->sb_bio);
+	dc->sb_bio.bi_max_vecs	= 1;
+	dc->sb_bio.bi_io_vec	= dc->sb_bio.bi_inline_vecs;
+	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+	get_page(sb_page);
 
-	bch_cached_dev_request_init(dc);
+	if (cached_dev_init(dc, sb->block_size << 9))
+		goto err;
 
 	err = "error creating kobject";
 	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
@@ -1103,6 +1091,8 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
 	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
 		goto err;
 
+	pr_info("registered backing device %s", bdevname(bdev, name));
+
 	list_add(&dc->list, &uncached_devices);
 	list_for_each_entry(c, &bch_cache_sets, list)
 		bch_cached_dev_attach(dc, c);
@@ -1111,15 +1101,10 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
 	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
 		bch_cached_dev_run(dc);
 
-	return NULL;
+	return;
 err:
-	kobject_put(&dc->disk.kobj);
 	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
-	/*
-	 * Return NULL instead of an error because kobject_put() cleans
-	 * everything up
-	 */
-	return NULL;
+	bcache_device_stop(&dc->disk);
 }
 
 /* Flash only volumes */
@@ -1717,20 +1702,11 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	size_t free;
 	struct bucket *b;
 
-	if (!ca)
-		return -ENOMEM;
-
 	__module_get(THIS_MODULE);
 	kobject_init(&ca->kobj, &bch_cache_ktype);
 
-	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
-
 	INIT_LIST_HEAD(&ca->discards);
 
-	bio_init(&ca->sb_bio);
-	ca->sb_bio.bi_max_vecs	= 1;
-	ca->sb_bio.bi_io_vec	= ca->sb_bio.bi_inline_vecs;
-
 	bio_init(&ca->journal.bio);
 	ca->journal.bio.bi_max_vecs = 8;
 	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
@@ -1742,18 +1718,17 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	    !init_fifo(&ca->free_inc,	free << 2, GFP_KERNEL) ||
 	    !init_fifo(&ca->unused,	free << 2, GFP_KERNEL) ||
 	    !init_heap(&ca->heap,	free << 3, GFP_KERNEL) ||
-	    !(ca->buckets	= vmalloc(sizeof(struct bucket) *
+	    !(ca->buckets	= vzalloc(sizeof(struct bucket) *
 					  ca->sb.nbuckets)) ||
 	    !(ca->prio_buckets	= kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
 					  2, GFP_KERNEL)) ||
 	    !(ca->disk_buckets	= alloc_bucket_pages(GFP_KERNEL, ca)) ||
 	    !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
 	    bio_split_pool_init(&ca->bio_split_hook))
-		goto err;
+		return -ENOMEM;
 
 	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
 
-	memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket));
 	for_each_bucket(b, ca)
 		atomic_set(&b->pin, 0);
 
@@ -1766,22 +1741,28 @@ err:
 	return -ENOMEM;
 }
 
-static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
+static void register_cache(struct cache_sb *sb, struct page *sb_page,
 				  struct block_device *bdev, struct cache *ca)
 {
 	char name[BDEVNAME_SIZE];
 	const char *err = "cannot allocate memory";
 
-	if (cache_alloc(sb, ca) != 0)
-		return err;
-
-	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
 	ca->bdev = bdev;
 	ca->bdev->bd_holder = ca;
 
+	bio_init(&ca->sb_bio);
+	ca->sb_bio.bi_max_vecs	= 1;
+	ca->sb_bio.bi_io_vec	= ca->sb_bio.bi_inline_vecs;
+	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+	get_page(sb_page);
+
 	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
 		ca->discard = CACHE_DISCARD(&ca->sb);
 
+	if (cache_alloc(sb, ca) != 0)
+		goto err;
+
 	err = "error creating kobject";
 	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
 		goto err;
@@ -1791,15 +1772,10 @@ static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
 		goto err;
 
 	pr_info("registered cache device %s", bdevname(bdev, name));
-
-	return NULL;
+	return;
 err:
+	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
 	kobject_put(&ca->kobj);
-	pr_info("error opening %s: %s", bdevname(bdev, name), err);
-	/* Return NULL instead of an error because kobject_put() cleans
-	 * everything up
-	 */
-	return NULL;
 }
 
 /* Global interfaces/init */
@@ -1833,12 +1809,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	bdev = blkdev_get_by_path(strim(path),
 				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
 				  sb);
-	if (bdev == ERR_PTR(-EBUSY))
-		err = "device busy";
-
-	if (IS_ERR(bdev) ||
-	    set_blocksize(bdev, 4096))
+	if (IS_ERR(bdev)) {
+		if (bdev == ERR_PTR(-EBUSY))
+			err = "device busy";
 		goto err;
+	}
+
+	err = "failed to set blocksize";
+	if (set_blocksize(bdev, 4096))
+		goto err_close;
 
 	err = read_super(sb, bdev, &sb_page);
 	if (err)
@@ -1846,33 +1825,33 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 
 	if (SB_IS_BDEV(sb)) {
 		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+		if (!dc)
+			goto err_close;
 
-		err = register_bdev(sb, sb_page, bdev, dc);
+		register_bdev(sb, sb_page, bdev, dc);
 	} else {
 		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+		if (!ca)
+			goto err_close;
 
-		err = register_cache(sb, sb_page, bdev, ca);
+		register_cache(sb, sb_page, bdev, ca);
 	}
-
-	if (err) {
-		/* register_(bdev|cache) will only return an error if they
-		 * didn't get far enough to create the kobject - if they did,
-		 * the kobject destructor will do this cleanup.
-		 */
+out:
+	if (sb_page)
 		put_page(sb_page);
-err_close:
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-err:
-		if (attr != &ksysfs_register_quiet)
-			pr_info("error opening %s: %s", path, err);
-		ret = -EINVAL;
-	}
-
 	kfree(sb);
 	kfree(path);
 	mutex_unlock(&bch_register_lock);
 	module_put(THIS_MODULE);
 	return ret;
+
+err_close:
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+err:
+	if (attr != &ksysfs_register_quiet)
+		pr_info("error opening %s: %s", path, err);
+	ret = -EINVAL;
+	goto out;
 }
 
 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 93e7e31..2714ed3 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -375,7 +375,7 @@ err:
 	refill_dirty(cl);
 }
 
-void bch_writeback_init_cached_dev(struct cached_dev *dc)
+void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
 	closure_init_unlocked(&dc->writeback);
 	init_rwsem(&dc->writeback_lock);
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a2c754..a7fd821 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -2002,9 +2002,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		} else {
 			int rv;
 			if (buf[0] == '+')
-				rv = strict_strtoll(buf+1, 10, &offset);
+				rv = kstrtoll(buf+1, 10, &offset);
 			else
-				rv = strict_strtoll(buf, 10, &offset);
+				rv = kstrtoll(buf, 10, &offset);
 			if (rv)
 				return rv;
 			if (offset == 0)
@@ -2139,7 +2139,7 @@ static ssize_t
 backlog_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long backlog;
-	int rv = strict_strtoul(buf, 10, &backlog);
+	int rv = kstrtoul(buf, 10, &backlog);
 	if (rv)
 		return rv;
 	if (backlog > COUNTER_MAX)
@@ -2165,7 +2165,7 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
 	unsigned long csize;
 	if (mddev->bitmap)
 		return -EBUSY;
-	rv = strict_strtoul(buf, 10, &csize);
+	rv = kstrtoul(buf, 10, &csize);
 	if (rv)
 		return rv;
 	if (csize < 512 ||
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0387e05..5227e07 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -145,6 +145,7 @@ struct dm_buffer {
 	unsigned long state;
 	unsigned long last_accessed;
 	struct dm_bufio_client *c;
+	struct list_head write_list;
 	struct bio bio;
 	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
 };
@@ -349,7 +350,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 	if (gfp_mask & __GFP_NORETRY)
 		noio_flag = memalloc_noio_save();
 
-	ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
 
 	if (gfp_mask & __GFP_NORETRY)
 		memalloc_noio_restore(noio_flag);
@@ -630,7 +631,8 @@ static int do_io_schedule(void *word)
  * - Submit our write and don't wait on it. We set B_WRITING indicating
  *   that there is a write in progress.
  */
-static void __write_dirty_buffer(struct dm_buffer *b)
+static void __write_dirty_buffer(struct dm_buffer *b,
+				 struct list_head *write_list)
 {
 	if (!test_bit(B_DIRTY, &b->state))
 		return;
@@ -639,7 +641,24 @@ static void __write_dirty_buffer(struct dm_buffer *b)
 	wait_on_bit_lock(&b->state, B_WRITING,
 			 do_io_schedule, TASK_UNINTERRUPTIBLE);
 
-	submit_io(b, WRITE, b->block, write_endio);
+	if (!write_list)
+		submit_io(b, WRITE, b->block, write_endio);
+	else
+		list_add_tail(&b->write_list, write_list);
+}
+
+static void __flush_write_list(struct list_head *write_list)
+{
+	struct blk_plug plug;
+	blk_start_plug(&plug);
+	while (!list_empty(write_list)) {
+		struct dm_buffer *b =
+			list_entry(write_list->next, struct dm_buffer, write_list);
+		list_del(&b->write_list);
+		submit_io(b, WRITE, b->block, write_endio);
+		dm_bufio_cond_resched();
+	}
+	blk_finish_plug(&plug);
 }
 
 /*
@@ -655,7 +674,7 @@ static void __make_buffer_clean(struct dm_buffer *b)
 		return;
 
 	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
-	__write_dirty_buffer(b);
+	__write_dirty_buffer(b, NULL);
 	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 }
 
@@ -802,7 +821,8 @@ static void __free_buffer_wake(struct dm_buffer *b)
 	wake_up(&c->free_buffer_wait);
 }
 
-static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
+static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
+					struct list_head *write_list)
 {
 	struct dm_buffer *b, *tmp;
 
@@ -818,7 +838,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
 		if (no_wait && test_bit(B_WRITING, &b->state))
 			return;
 
-		__write_dirty_buffer(b);
+		__write_dirty_buffer(b, write_list);
 		dm_bufio_cond_resched();
 	}
 }
@@ -853,7 +873,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
  * If we are over threshold_buffers, start freeing buffers.
  * If we're over "limit_buffers", block until we get under the limit.
  */
-static void __check_watermark(struct dm_bufio_client *c)
+static void __check_watermark(struct dm_bufio_client *c,
+			      struct list_head *write_list)
 {
 	unsigned long threshold_buffers, limit_buffers;
 
@@ -872,7 +893,7 @@ static void __check_watermark(struct dm_bufio_client *c)
 	}
 
 	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
-		__write_dirty_buffers_async(c, 1);
+		__write_dirty_buffers_async(c, 1, write_list);
 }
 
 /*
@@ -897,7 +918,8 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
  *--------------------------------------------------------------*/
 
 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
-				     enum new_flag nf, int *need_submit)
+				     enum new_flag nf, int *need_submit,
+				     struct list_head *write_list)
 {
 	struct dm_buffer *b, *new_b = NULL;
 
@@ -924,7 +946,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 		goto found_buffer;
 	}
 
-	__check_watermark(c);
+	__check_watermark(c, write_list);
 
 	b = new_b;
 	b->hold_count = 1;
@@ -992,10 +1014,14 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
 	int need_submit;
 	struct dm_buffer *b;
 
+	LIST_HEAD(write_list);
+
 	dm_bufio_lock(c);
-	b = __bufio_new(c, block, nf, &need_submit);
+	b = __bufio_new(c, block, nf, &need_submit, &write_list);
 	dm_bufio_unlock(c);
 
+	__flush_write_list(&write_list);
+
 	if (!b)
 		return b;
 
@@ -1047,6 +1073,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
 {
 	struct blk_plug plug;
 
+	LIST_HEAD(write_list);
+
 	BUG_ON(dm_bufio_in_request());
 
 	blk_start_plug(&plug);
@@ -1055,7 +1083,15 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
 	for (; n_blocks--; block++) {
 		int need_submit;
 		struct dm_buffer *b;
-		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
+				&write_list);
+		if (unlikely(!list_empty(&write_list))) {
+			dm_bufio_unlock(c);
+			blk_finish_plug(&plug);
+			__flush_write_list(&write_list);
+			blk_start_plug(&plug);
+			dm_bufio_lock(c);
+		}
 		if (unlikely(b != NULL)) {
 			dm_bufio_unlock(c);
 
@@ -1069,7 +1105,6 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
 				goto flush_plug;
 			dm_bufio_lock(c);
 		}
-
 	}
 
 	dm_bufio_unlock(c);
@@ -1126,11 +1161,14 @@ EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
 
 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
 {
+	LIST_HEAD(write_list);
+
 	BUG_ON(dm_bufio_in_request());
 
 	dm_bufio_lock(c);
-	__write_dirty_buffers_async(c, 0);
+	__write_dirty_buffers_async(c, 0, &write_list);
 	dm_bufio_unlock(c);
+	__flush_write_list(&write_list);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
 
@@ -1147,8 +1185,13 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
 	unsigned long buffers_processed = 0;
 	struct dm_buffer *b, *tmp;
 
+	LIST_HEAD(write_list);
+
+	dm_bufio_lock(c);
+	__write_dirty_buffers_async(c, 0, &write_list);
+	dm_bufio_unlock(c);
+	__flush_write_list(&write_list);
 	dm_bufio_lock(c);
-	__write_dirty_buffers_async(c, 0);
 
 again:
 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
@@ -1274,7 +1317,7 @@ retry:
 	BUG_ON(!b->hold_count);
 	BUG_ON(test_bit(B_READING, &b->state));
 
-	__write_dirty_buffer(b);
+	__write_dirty_buffer(b, NULL);
 	if (b->hold_count == 1) {
 		wait_on_bit(&b->state, B_WRITING,
 			    do_io_schedule, TASK_UNINTERRUPTIBLE);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index df44b60..0df3ec0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -425,6 +425,10 @@ static bool block_size_is_power_of_two(struct cache *cache)
 	return cache->sectors_per_block_shift >= 0;
 }
 
+/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
+#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+__always_inline
+#endif
 static dm_block_t block_div(dm_block_t b, uint32_t n)
 {
 	do_div(b, n);
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 7fcf21c..c80a0ec 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -176,7 +176,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc) {
-		ti->error = "Cannot allocate linear context";
+		ti->error = "Cannot allocate context";
 		return -ENOMEM;
 	}
 	fc->start_time = jiffies;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index aa04f02..f1b7586 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,6 +36,14 @@ struct hash_cell {
 	struct dm_table *new_map;
 };
 
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+	int undefined__;
+};
+
 struct vers_iter {
     size_t param_size;
     struct dm_target_versions *vers, *old_vers;
@@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
 	return -EBUSY;
 }
 
-static void __hash_remove(struct hash_cell *hc)
+static struct dm_table *__hash_remove(struct hash_cell *hc)
 {
 	struct dm_table *table;
+	int srcu_idx;
 
 	/* remove from the dev hash */
 	list_del(&hc->uuid_list);
@@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc)
 	dm_set_mdptr(hc->md, NULL);
 	mutex_unlock(&dm_hash_cells_mutex);
 
-	table = dm_get_live_table(hc->md);
-	if (table) {
+	table = dm_get_live_table(hc->md, &srcu_idx);
+	if (table)
 		dm_table_event(table);
-		dm_table_put(table);
-	}
+	dm_put_live_table(hc->md, srcu_idx);
 
+	table = NULL;
 	if (hc->new_map)
-		dm_table_destroy(hc->new_map);
+		table = hc->new_map;
 	dm_put(hc->md);
 	free_cell(hc);
+
+	return table;
 }
 
 static void dm_hash_remove_all(int keep_open_devices)
@@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices)
 	int i, dev_skipped;
 	struct hash_cell *hc;
 	struct mapped_device *md;
+	struct dm_table *t;
 
 retry:
 	dev_skipped = 0;
@@ -287,10 +299,14 @@ retry:
 				continue;
 			}
 
-			__hash_remove(hc);
+			t = __hash_remove(hc);
 
 			up_write(&_hash_lock);
 
+			if (t) {
+				dm_sync_table(md);
+				dm_table_destroy(t);
+			}
 			dm_put(md);
 			if (likely(keep_open_devices))
 				dm_destroy(md);
@@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	struct dm_table *table;
 	struct mapped_device *md;
 	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
+	int srcu_idx;
 
 	/*
 	 * duplicate new.
@@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	/*
 	 * Wake up any dm event waiters.
 	 */
-	table = dm_get_live_table(hc->md);
-	if (table) {
+	table = dm_get_live_table(hc->md, &srcu_idx);
+	if (table)
 		dm_table_event(table);
-		dm_table_put(table);
-	}
+	dm_put_live_table(hc->md, srcu_idx);
 
 	if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
 		param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -620,11 +636,14 @@ static int check_name(const char *name)
  * _hash_lock without first calling dm_table_put, because dm_table_destroy
  * waits for this dm_table_put and could be called under this lock.
  */
-static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
 {
 	struct hash_cell *hc;
 	struct dm_table *table = NULL;
 
+	/* increment rcu count, we don't care about the table pointer */
+	dm_get_live_table(md, srcu_idx);
+
 	down_read(&_hash_lock);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
@@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
 	}
 
 	table = hc->new_map;
-	if (table)
-		dm_table_get(table);
 
 out:
 	up_read(&_hash_lock);
@@ -643,10 +660,11 @@ out:
 }
 
 static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
-						      struct dm_ioctl *param)
+						      struct dm_ioctl *param,
+						      int *srcu_idx)
 {
 	return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
-		dm_get_inactive_table(md) : dm_get_live_table(md);
+		dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx);
 }
 
 /*
@@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 {
 	struct gendisk *disk = dm_disk(md);
 	struct dm_table *table;
+	int srcu_idx;
 
 	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
 			  DM_ACTIVE_PRESENT_FLAG);
@@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	param->event_nr = dm_get_event_nr(md);
 	param->target_count = 0;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_table(md, &srcu_idx);
 	if (table) {
 		if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
 			if (get_disk_ro(disk))
 				param->flags |= DM_READONLY_FLAG;
 			param->target_count = dm_table_get_num_targets(table);
 		}
-		dm_table_put(table);
 
 		param->flags |= DM_ACTIVE_PRESENT_FLAG;
 	}
+	dm_put_live_table(md, srcu_idx);
 
 	if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
-		table = dm_get_inactive_table(md);
+		int srcu_idx;
+		table = dm_get_inactive_table(md, &srcu_idx);
 		if (table) {
 			if (!(dm_table_get_mode(table) & FMODE_WRITE))
 				param->flags |= DM_READONLY_FLAG;
 			param->target_count = dm_table_get_num_targets(table);
-			dm_table_put(table);
 		}
+		dm_put_live_table(md, srcu_idx);
 	}
 }
 
@@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 	struct hash_cell *hc;
 	struct mapped_device *md;
 	int r;
+	struct dm_table *t;
 
 	down_write(&_hash_lock);
 	hc = __find_device_hash_cell(param);
@@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 		return r;
 	}
 
-	__hash_remove(hc);
+	t = __hash_remove(hc);
 	up_write(&_hash_lock);
 
+	if (t) {
+		dm_sync_table(md);
+		dm_table_destroy(t);
+	}
+
 	if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
 		param->flags |= DM_UEVENT_GENERATED_FLAG;
 
@@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param)
 
 		old_map = dm_swap_table(md, new_map);
 		if (IS_ERR(old_map)) {
+			dm_sync_table(md);
 			dm_table_destroy(new_map);
 			dm_put(md);
 			return PTR_ERR(old_map);
@@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param)
 			param->flags |= DM_UEVENT_GENERATED_FLAG;
 	}
 
+	/*
+	 * Since dm_swap_table synchronizes RCU, nobody should be in
+	 * read-side critical section already.
+	 */
 	if (old_map)
 		dm_table_destroy(old_map);
 
@@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	int r = 0;
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	 */
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_status(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 out:
 	dm_put(md);
@@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 {
 	int r;
 	struct hash_cell *hc;
-	struct dm_table *t;
+	struct dm_table *t, *old_map = NULL;
 	struct mapped_device *md;
 	struct target_type *immutable_target_type;
 
@@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
 		DMWARN("device has been removed from the dev hash table.");
-		dm_table_destroy(t);
 		up_write(&_hash_lock);
+		dm_table_destroy(t);
 		r = -ENXIO;
 		goto out;
 	}
 
 	if (hc->new_map)
-		dm_table_destroy(hc->new_map);
+		old_map = hc->new_map;
 	hc->new_map = t;
 	up_write(&_hash_lock);
 
@@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	__dev_status(md, param);
 
 out:
+	if (old_map) {
+		dm_sync_table(md);
+		dm_table_destroy(old_map);
+	}
+
 	dm_put(md);
 
 	return r;
@@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 {
 	struct hash_cell *hc;
 	struct mapped_device *md;
+	struct dm_table *old_map = NULL;
 
 	down_write(&_hash_lock);
 
@@ -1312,7 +1349,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map) {
-		dm_table_destroy(hc->new_map);
+		old_map = hc->new_map;
 		hc->new_map = NULL;
 	}
 
@@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	__dev_status(hc->md, param);
 	md = hc->md;
 	up_write(&_hash_lock);
+	if (old_map) {
+		dm_sync_table(md);
+		dm_table_destroy(old_map);
+	}
 	dm_put(md);
 
 	return 0;
@@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_deps(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 	dm_put(md);
 
@@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_status(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 	dm_put(md);
 
@@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	struct dm_target_msg *tmsg = (void *) param + param->data_start;
 	size_t maxlen;
 	char *result = get_result_buffer(param, param_size, &maxlen);
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	if (r <= 1)
 		goto out_argv;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_table(md, &srcu_idx);
 	if (!table)
-		goto out_argv;
+		goto out_table;
 
 	if (dm_deleting_md(md)) {
 		r = -ENXIO;
@@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	}
 
  out_table:
-	dm_table_put(table);
+	dm_put_live_table(md, srcu_idx);
  out_argv:
 	kfree(argv);
  out:
@@ -1644,7 +1686,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
 	}
 
 	if (!dmi) {
-		dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
+		unsigned noio_flag;
+		noio_flag = memalloc_noio_save();
+		dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
+		memalloc_noio_restore(noio_flag);
 		if (dmi)
 			*param_flags |= DM_PARAMS_VMALLOC;
 	}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index bdf26f5..5adede1 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1561,7 +1561,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	unsigned long flags;
 	int r;
 
-again:
 	bdev = NULL;
 	mode = 0;
 	r = 0;
@@ -1579,7 +1578,7 @@ again:
 	}
 
 	if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
-		r = -EAGAIN;
+		r = -ENOTCONN;
 	else if (!bdev)
 		r = -EIO;
 
@@ -1591,11 +1590,8 @@ again:
 	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
 		r = scsi_verify_blk_ioctl(NULL, cmd);
 
-	if (r == -EAGAIN && !fatal_signal_pending(current)) {
+	if (r == -ENOTCONN && !fatal_signal_pending(current))
 		queue_work(kmultipathd, &m->process_queued_ios);
-		msleep(10);
-		goto again;
-	}
 
 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1d3fe1a..4880b69 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -380,7 +380,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned rebuilds_per_group, copies, d;
+	unsigned rebuilds_per_group = 0, copies, d;
 	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
@@ -504,7 +504,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	 * First, parse the in-order required arguments
 	 * "chunk_size" is the only argument of this type.
 	 */
-	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
+	if ((kstrtoul(argv[0], 10, &value) < 0)) {
 		rs->ti->error = "Bad chunk size";
 		return -EINVAL;
 	} else if (rs->raid_type->level == 1) {
@@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			continue;
 		}
 
-		if (strict_strtoul(argv[i], 10, &value) < 0) {
+		if (kstrtoul(argv[i], 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
@@ -1181,7 +1181,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	argv++;
 
 	/* number of RAID parameters */
-	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
+	if (kstrtoul(argv[0], 10, &num_raid_params) < 0) {
 		ti->error = "Cannot understand number of RAID parameters";
 		return -EINVAL;
 	}
@@ -1194,7 +1194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 	}
 
-	if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
 	    (num_raid_devs >= INT_MAX)) {
 		ti->error = "Cannot understand number of raid devices";
 		return -EINVAL;
@@ -1388,6 +1388,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		 *   performing a "check" of the array.
 		 */
 		DMEMIT(" %llu",
+		       (strcmp(rs->md.last_sync_action, "check")) ? 0 :
 		       (unsigned long long)
 		       atomic64_read(&rs->md.resync_mismatches));
 		break;
@@ -1572,6 +1573,62 @@ static void raid_postsuspend(struct dm_target *ti)
 	mddev_suspend(&rs->md);
 }
 
+static void attempt_restore_of_faulty_devices(struct raid_set *rs)
+{
+	int i;
+	uint64_t failed_devices, cleared_failed_devices = 0;
+	unsigned long flags;
+	struct dm_raid_superblock *sb;
+	struct md_rdev *r;
+
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		r = &rs->dev[i].rdev;
+		if (test_bit(Faulty, &r->flags) && r->sb_page &&
+		    sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) {
+			DMINFO("Faulty %s device #%d has readable super block."
+			       "  Attempting to revive it.",
+			       rs->raid_type->name, i);
+
+			/*
+			 * Faulty bit may be set, but sometimes the array can
+			 * be suspended before the personalities can respond
+			 * by removing the device from the array (i.e. calling
+			 * 'hot_remove_disk').  If they haven't yet removed
+			 * the failed device, its 'raid_disk' number will be
+			 * '>= 0' - meaning we must call this function
+			 * ourselves.
+			 */
+			if ((r->raid_disk >= 0) &&
+			    (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
+				/* Failed to revive this device, try next */
+				continue;
+
+			r->raid_disk = i;
+			r->saved_raid_disk = i;
+			flags = r->flags;
+			clear_bit(Faulty, &r->flags);
+			clear_bit(WriteErrorSeen, &r->flags);
+			clear_bit(In_sync, &r->flags);
+			if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+				r->raid_disk = -1;
+				r->saved_raid_disk = -1;
+				r->flags = flags;
+			} else {
+				r->recovery_offset = 0;
+				cleared_failed_devices |= 1 << i;
+			}
+		}
+	}
+	if (cleared_failed_devices) {
+		rdev_for_each(r, &rs->md) {
+			sb = page_address(r->sb_page);
+			failed_devices = le64_to_cpu(sb->failed_devices);
+			failed_devices &= ~cleared_failed_devices;
+			sb->failed_devices = cpu_to_le64(failed_devices);
+		}
+	}
+}
+
 static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
@@ -1580,6 +1637,13 @@ static void raid_resume(struct dm_target *ti)
 	if (!rs->bitmap_loaded) {
 		bitmap_load(&rs->md);
 		rs->bitmap_loaded = 1;
+	} else {
+		/*
+		 * A secondary resume while the device is active.
+		 * Take this opportunity to check whether any failed
+		 * devices are reachable again.
+		 */
+		attempt_restore_of_faulty_devices(rs);
 	}
 
 	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
@@ -1588,7 +1652,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 5, 0},
+	.version = {1, 5, 2},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
new file mode 100644
index 0000000..ff9ac4b
--- /dev/null
+++ b/drivers/md/dm-switch.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
+ * Copyright (C) 2011-2013 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * dm-switch is a device-mapper target that maps IO to underlying block
+ * devices efficiently when there are a large number of fixed-sized
+ * address regions but there is no simple pattern to allow for a compact
+ * mapping representation such as dm-stripe.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One region_table_slot_t holds <region_entries_per_slot> region table
+ * entries each of which is <region_table_entry_bits> in size.
+ */
+typedef unsigned long region_table_slot_t;
+
+/*
+ * A device with the offset to its start sector.
+ */
+struct switch_path {
+	struct dm_dev *dmdev;
+	sector_t start;
+};
+
+/*
+ * Context block for a dm switch device.
+ */
+struct switch_ctx {
+	struct dm_target *ti;
+
+	unsigned nr_paths;		/* Number of paths in path_list. */
+
+	unsigned region_size;		/* Region size in 512-byte sectors */
+	unsigned long nr_regions;	/* Number of regions making up the device */
+	signed char region_size_bits;	/* log2 of region_size or -1 */
+
+	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
+	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
+	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */
+
+	region_table_slot_t *region_table;	/* Region table */
+
+	/*
+	 * Array of dm devices to switch between.
+	 */
+	struct switch_path path_list[0];
+};
+
+static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
+					   unsigned region_size)
+{
+	struct switch_ctx *sctx;
+
+	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
+		       GFP_KERNEL);
+	if (!sctx)
+		return NULL;
+
+	sctx->ti = ti;
+	sctx->region_size = region_size;
+
+	ti->private = sctx;
+
+	return sctx;
+}
+
+static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
+{
+	struct switch_ctx *sctx = ti->private;
+	sector_t nr_regions = ti->len;
+	sector_t nr_slots;
+
+	if (!(sctx->region_size & (sctx->region_size - 1)))
+		sctx->region_size_bits = __ffs(sctx->region_size);
+	else
+		sctx->region_size_bits = -1;
+
+	sctx->region_table_entry_bits = 1;
+	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
+	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
+		sctx->region_table_entry_bits++;
+
+	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
+	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
+		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
+	else
+		sctx->region_entries_per_slot_bits = -1;
+
+	if (sector_div(nr_regions, sctx->region_size))
+		nr_regions++;
+
+	sctx->nr_regions = nr_regions;
+	if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
+		ti->error = "Region table too large";
+		return -EINVAL;
+	}
+
+	nr_slots = nr_regions;
+	if (sector_div(nr_slots, sctx->region_entries_per_slot))
+		nr_slots++;
+
+	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
+		ti->error = "Region table too large";
+		return -EINVAL;
+	}
+
+	sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
+	if (!sctx->region_table) {
+		ti->error = "Cannot allocate region table";
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
+				unsigned long *region_index, unsigned *bit)
+{
+	if (sctx->region_entries_per_slot_bits >= 0) {
+		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
+		*bit = region_nr & (sctx->region_entries_per_slot - 1);
+	} else {
+		*region_index = region_nr / sctx->region_entries_per_slot;
+		*bit = region_nr % sctx->region_entries_per_slot;
+	}
+
+	*bit *= sctx->region_table_entry_bits;
+}
+
+/*
+ * Find which path to use at given offset.
+ */
+static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
+{
+	unsigned long region_index;
+	unsigned bit, path_nr;
+	sector_t p;
+
+	p = offset;
+	if (sctx->region_size_bits >= 0)
+		p >>= sctx->region_size_bits;
+	else
+		sector_div(p, sctx->region_size);
+
+	switch_get_position(sctx, p, &region_index, &bit);
+	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+	       ((1 << sctx->region_table_entry_bits) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(path_nr >= sctx->nr_paths))
+		path_nr = 0;
+
+	return path_nr;
+}
+
+static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
+				      unsigned value)
+{
+	unsigned long region_index;
+	unsigned bit;
+	region_table_slot_t pte;
+
+	switch_get_position(sctx, region_nr, &region_index, &bit);
+
+	pte = sctx->region_table[region_index];
+	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
+	pte |= (region_table_slot_t)value << bit;
+	sctx->region_table[region_index] = pte;
+}
+
+/*
+ * Fill the region table with an initial round robin pattern.
+ */
+static void initialise_region_table(struct switch_ctx *sctx)
+{
+	unsigned path_nr = 0;
+	unsigned long region_nr;
+
+	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
+		switch_region_table_write(sctx, region_nr, path_nr);
+		if (++path_nr >= sctx->nr_paths)
+			path_nr = 0;
+	}
+}
+
+static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned long long start;
+	int r;
+
+	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+			  &sctx->path_list[sctx->nr_paths].dmdev);
+	if (r) {
+		ti->error = "Device lookup failed";
+		return r;
+	}
+
+	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
+		ti->error = "Invalid device starting offset";
+		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+		return -EINVAL;
+	}
+
+	sctx->path_list[sctx->nr_paths].start = start;
+
+	sctx->nr_paths++;
+
+	return 0;
+}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+
+	while (sctx->nr_paths--)
+		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+
+	vfree(sctx->region_table);
+	kfree(sctx);
+}
+
+/*
+ * Constructor arguments:
+ *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
+ *   [<dev_path> <offset>]+
+ *
+ * Optional args are to allow for future extension: currently this
+ * parameter must be 0.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static struct dm_arg _args[] = {
+		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
+		{1, UINT_MAX, "Invalid region size"},
+		{0, 0, "Invalid number of optional args"},
+	};
+
+	struct switch_ctx *sctx;
+	struct dm_arg_set as;
+	unsigned nr_paths, region_size, nr_optional_args;
+	int r;
+
+	as.argc = argc;
+	as.argv = argv;
+
+	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
+	if (r)
+		return r;
+
+	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
+	if (r)
+		return r;
+	/* parse optional arguments here, if we add any */
+
+	if (as.argc != nr_paths * 2) {
+		ti->error = "Incorrect number of path arguments";
+		return -EINVAL;
+	}
+
+	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
+	if (!sctx) {
+		ti->error = "Cannot allocate redirection context";
+		return -ENOMEM;
+	}
+
+	r = dm_set_target_max_io_len(ti, region_size);
+	if (r)
+		goto error;
+
+	while (as.argc) {
+		r = parse_path(&as, ti);
+		if (r)
+			goto error;
+	}
+
+	r = alloc_region_table(ti, nr_paths);
+	if (r)
+		goto error;
+
+	initialise_region_table(sctx);
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_bios = 1;
+
+	return 0;
+
+error:
+	switch_dtr(ti);
+
+	return r;
+}
+
+static int switch_map(struct dm_target *ti, struct bio *bio)
+{
+	struct switch_ctx *sctx = ti->private;
+	sector_t offset = dm_target_offset(ti, bio->bi_sector);
+	unsigned path_nr = switch_get_path_nr(sctx, offset);
+
+	bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
+	bio->bi_sector = sctx->path_list[path_nr].start + offset;
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers in the message as quickly as possible.
+ *
+ * This table-based hex parser improves performance.
+ * It improves a time to load 1000000 entries compared to the condition-based
+ * parser.
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ */
+static const unsigned char hex_table[256] = {
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+static __always_inline unsigned long parse_hex(const char **string)
+{
+	unsigned char d;
+	unsigned long r = 0;
+
+	while ((d = hex_table[(unsigned char)**string]) < 16) {
+		r = (r << 4) | d;
+		(*string)++;
+	}
+
+	return r;
+}
+
+static int process_set_region_mappings(struct switch_ctx *sctx,
+			     unsigned argc, char **argv)
+{
+	unsigned i;
+	unsigned long region_index = 0;
+
+	for (i = 1; i < argc; i++) {
+		unsigned long path_nr;
+		const char *string = argv[i];
+
+		if (*string == ':')
+			region_index++;
+		else {
+			region_index = parse_hex(&string);
+			if (unlikely(*string != ':')) {
+				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+				return -EINVAL;
+			}
+		}
+
+		string++;
+		if (unlikely(!*string)) {
+			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+			return -EINVAL;
+		}
+
+		path_nr = parse_hex(&string);
+		if (unlikely(*string)) {
+			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+			return -EINVAL;
+		}
+		if (unlikely(region_index >= sctx->nr_regions)) {
+			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
+			return -EINVAL;
+		}
+		if (unlikely(path_nr >= sctx->nr_paths)) {
+			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
+			return -EINVAL;
+		}
+
+		switch_region_table_write(sctx, region_index, path_nr);
+	}
+
+	return 0;
+}
+
+/*
+ * Messages are processed one-at-a-time.
+ *
+ * Only set_region_mappings is supported.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static DEFINE_MUTEX(message_mutex);
+
+	struct switch_ctx *sctx = ti->private;
+	int r = -EINVAL;
+
+	mutex_lock(&message_mutex);
+
+	if (!strcasecmp(argv[0], "set_region_mappings"))
+		r = process_set_region_mappings(sctx, argc, argv);
+	else
+		DMWARN("Unrecognised message received.");
+
+	mutex_unlock(&message_mutex);
+
+	return r;
+}
+
+static void switch_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned sz = 0;
+	int path_nr;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
+		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
+			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
+			       (unsigned long long)sctx->path_list[path_nr].start);
+		break;
+	}
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the path for sector 0
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *sctx = ti->private;
+	struct block_device *bdev;
+	fmode_t mode;
+	unsigned path_nr;
+	int r = 0;
+
+	path_nr = switch_get_path_nr(sctx, 0);
+
+	bdev = sctx->path_list[path_nr].dmdev->bdev;
+	mode = sctx->path_list[path_nr].dmdev->mode;
+
+	/*
+	 * Only pass ioctls through if the device sizes match exactly.
+	 */
+	if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+		r = scsi_verify_blk_ioctl(NULL, cmd);
+
+	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *sctx = ti->private;
+	int path_nr;
+	int r;
+
+	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
+		r = fn(ti, sctx->path_list[path_nr].dmdev,
+			 sctx->path_list[path_nr].start, ti->len, data);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static struct target_type switch_target = {
+	.name = "switch",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = switch_ctr,
+	.dtr = switch_dtr,
+	.map = switch_map,
+	.message = switch_message,
+	.status = switch_status,
+	.ioctl = switch_ioctl,
+	.iterate_devices = switch_iterate_devices,
+};
+
+static int __init dm_switch_init(void)
+{
+	int r;
+
+	r = dm_register_target(&switch_target);
+	if (r < 0)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1ff252a..f221812 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -26,22 +26,8 @@
 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
 
-/*
- * The table has always exactly one reference from either mapped_device->map
- * or hash_cell->new_map. This reference is not counted in table->holders.
- * A pair of dm_create_table/dm_destroy_table functions is used for table
- * creation/destruction.
- *
- * Temporary references from the other code increase table->holders. A pair
- * of dm_table_get/dm_table_put functions is used to manipulate it.
- *
- * When the table is about to be destroyed, we wait for table->holders to
- * drop to zero.
- */
-
 struct dm_table {
 	struct mapped_device *md;
-	atomic_t holders;
 	unsigned type;
 
 	/* btree table */
@@ -208,7 +194,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	INIT_LIST_HEAD(&t->devices);
 	INIT_LIST_HEAD(&t->target_callbacks);
-	atomic_set(&t->holders, 0);
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
@@ -246,10 +231,6 @@ void dm_table_destroy(struct dm_table *t)
 	if (!t)
 		return;
 
-	while (atomic_read(&t->holders))
-		msleep(1);
-	smp_mb();
-
 	/* free the indexes */
 	if (t->depth >= 2)
 		vfree(t->index[t->depth - 2]);
@@ -274,22 +255,6 @@ void dm_table_destroy(struct dm_table *t)
 	kfree(t);
 }
 
-void dm_table_get(struct dm_table *t)
-{
-	atomic_inc(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_get);
-
-void dm_table_put(struct dm_table *t)
-{
-	if (!t)
-		return;
-
-	smp_mb__before_atomic_dec();
-	atomic_dec(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_put);
-
 /*
  * Checks to see if we need to extend highs or targets.
  */
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index b948fd8..4b7941d 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -451,7 +451,7 @@ static void verity_prefetch_io(struct work_struct *work)
 				goto no_prefetch_cluster;
 
 			if (unlikely(cluster & (cluster - 1)))
-				cluster = 1 << (fls(cluster) - 1);
+				cluster = 1 << __fls(cluster);
 
 			hash_block_start &= ~(sector_t)(cluster - 1);
 			hash_block_end |= cluster - 1;
@@ -695,8 +695,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
-	if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
-	    num < 0 || num > 1) {
+	if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
+	    num > 1) {
 		ti->error = "Invalid version";
 		r = -EINVAL;
 		goto bad;
@@ -723,7 +723,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		r = -EINVAL;
 		goto bad;
 	}
-	v->data_dev_block_bits = ffs(num) - 1;
+	v->data_dev_block_bits = __ffs(num);
 
 	if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
 	    !num || (num & (num - 1)) ||
@@ -733,7 +733,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		r = -EINVAL;
 		goto bad;
 	}
-	v->hash_dev_block_bits = ffs(num) - 1;
+	v->hash_dev_block_bits = __ffs(num);
 
 	if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
 	    (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
@@ -812,7 +812,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	v->hash_per_block_bits =
-		fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
+		__fls((1 << v->hash_dev_block_bits) / v->digest_size);
 
 	v->levels = 0;
 	if (v->data_blocks)
@@ -831,9 +831,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	for (i = v->levels - 1; i >= 0; i--) {
 		sector_t s;
 		v->hash_level_block[i] = hash_position;
-		s = verity_position_at_level(v, v->data_blocks, i);
-		s = (s >> v->hash_per_block_bits) +
-		    !!(s & ((1 << v->hash_per_block_bits) - 1));
+		s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1)
+					>> ((i + 1) * v->hash_per_block_bits);
 		if (hash_position + s < hash_position) {
 			ti->error = "Hash device offset overflow";
 			r = -E2BIG;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d5370a9..9e39d2b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -117,15 +117,29 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_MERGE_IS_OPTIONAL 6
 
 /*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+	int undefined__;
+};
+
+/*
  * Work processed by per-device workqueue.
  */
 struct mapped_device {
-	struct rw_semaphore io_lock;
+	struct srcu_struct io_barrier;
 	struct mutex suspend_lock;
-	rwlock_t map_lock;
 	atomic_t holders;
 	atomic_t open_count;
 
+	/*
+	 * The current mapping.
+	 * Use dm_get_live_table{_fast} or take suspend_lock for
+	 * dereference.
+	 */
+	struct dm_table *map;
+
 	unsigned long flags;
 
 	struct request_queue *queue;
@@ -155,11 +169,6 @@ struct mapped_device {
 	struct workqueue_struct *wq;
 
 	/*
-	 * The current mapping.
-	 */
-	struct dm_table *map;
-
-	/*
 	 * io objects are allocated from here.
 	 */
 	mempool_t *io_pool;
@@ -386,10 +395,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long arg)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
-	struct dm_table *map = dm_get_live_table(md);
+	int srcu_idx;
+	struct dm_table *map;
 	struct dm_target *tgt;
 	int r = -ENOTTY;
 
+retry:
+	map = dm_get_live_table(md, &srcu_idx);
+
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -408,7 +421,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 		r = tgt->type->ioctl(tgt, cmd, arg);
 
 out:
-	dm_table_put(map);
+	dm_put_live_table(md, srcu_idx);
+
+	if (r == -ENOTCONN) {
+		msleep(10);
+		goto retry;
+	}
 
 	return r;
 }
@@ -502,20 +520,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
 /*
  * Everyone (including functions in this file), should use this
  * function to access the md->map field, and make sure they call
- * dm_table_put() when finished.
+ * dm_put_live_table() when finished.
  */
-struct dm_table *dm_get_live_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 {
-	struct dm_table *t;
-	unsigned long flags;
+	*srcu_idx = srcu_read_lock(&md->io_barrier);
+
+	return srcu_dereference(md->map, &md->io_barrier);
+}
 
-	read_lock_irqsave(&md->map_lock, flags);
-	t = md->map;
-	if (t)
-		dm_table_get(t);
-	read_unlock_irqrestore(&md->map_lock, flags);
+void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
+{
+	srcu_read_unlock(&md->io_barrier, srcu_idx);
+}
 
-	return t;
+void dm_sync_table(struct mapped_device *md)
+{
+	synchronize_srcu(&md->io_barrier);
+	synchronize_rcu_expedited();
+}
+
+/*
+ * A fast alternative to dm_get_live_table/dm_put_live_table.
+ * The caller must not block between these two functions.
+ */
+static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(md->map);
+}
+
+static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
+{
+	rcu_read_unlock();
 }
 
 /*
@@ -1349,17 +1386,18 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
-static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md,
+				    struct dm_table *map, struct bio *bio)
 {
 	struct clone_info ci;
 	int error = 0;
 
-	ci.map = dm_get_live_table(md);
-	if (unlikely(!ci.map)) {
+	if (unlikely(!map)) {
 		bio_io_error(bio);
 		return;
 	}
 
+	ci.map = map;
 	ci.md = md;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
@@ -1386,7 +1424,6 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
-	dm_table_put(ci.map);
 }
 /*-----------------------------------------------------------------
  * CRUD END
@@ -1397,7 +1434,7 @@ static int dm_merge_bvec(struct request_queue *q,
 			 struct bio_vec *biovec)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	struct dm_table *map = dm_get_live_table_fast(md);
 	struct dm_target *ti;
 	sector_t max_sectors;
 	int max_size = 0;
@@ -1407,7 +1444,7 @@ static int dm_merge_bvec(struct request_queue *q,
 
 	ti = dm_table_find_target(map, bvm->bi_sector);
 	if (!dm_target_is_valid(ti))
-		goto out_table;
+		goto out;
 
 	/*
 	 * Find maximum amount of I/O that won't need splitting
@@ -1436,10 +1473,8 @@ static int dm_merge_bvec(struct request_queue *q,
 
 		max_size = 0;
 
-out_table:
-	dm_table_put(map);
-
 out:
+	dm_put_live_table_fast(md);
 	/*
 	 * Always allow an entire first page
 	 */
@@ -1458,8 +1493,10 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
 	int cpu;
+	int srcu_idx;
+	struct dm_table *map;
 
-	down_read(&md->io_lock);
+	map = dm_get_live_table(md, &srcu_idx);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
@@ -1468,7 +1505,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 
 	/* if we're suspended, we have to queue this io for later */
 	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
-		up_read(&md->io_lock);
+		dm_put_live_table(md, srcu_idx);
 
 		if (bio_rw(bio) != READA)
 			queue_io(md, bio);
@@ -1477,8 +1514,8 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	__split_and_process_bio(md, bio);
-	up_read(&md->io_lock);
+	__split_and_process_bio(md, map, bio);
+	dm_put_live_table(md, srcu_idx);
 	return;
 }
 
@@ -1664,7 +1701,8 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
 static void dm_request_fn(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	int srcu_idx;
+	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 	struct dm_target *ti;
 	struct request *rq, *clone;
 	sector_t pos;
@@ -1719,7 +1757,7 @@ requeued:
 delay_and_out:
 	blk_delay_queue(q, HZ / 10);
 out:
-	dm_table_put(map);
+	dm_put_live_table(md, srcu_idx);
 }
 
 int dm_underlying_device_busy(struct request_queue *q)
@@ -1732,14 +1770,14 @@ static int dm_lld_busy(struct request_queue *q)
 {
 	int r;
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	struct dm_table *map = dm_get_live_table_fast(md);
 
 	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
 		r = 1;
 	else
 		r = dm_table_any_busy_target(map);
 
-	dm_table_put(map);
+	dm_put_live_table_fast(md);
 
 	return r;
 }
@@ -1751,7 +1789,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	struct dm_table *map;
 
 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-		map = dm_get_live_table(md);
+		map = dm_get_live_table_fast(md);
 		if (map) {
 			/*
 			 * Request-based dm cares about only own queue for
@@ -1762,9 +1800,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 				    bdi_bits;
 			else
 				r = dm_table_any_congested(map, bdi_bits);
-
-			dm_table_put(map);
 		}
+		dm_put_live_table_fast(md);
 	}
 
 	return r;
@@ -1869,12 +1906,14 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_minor;
 
+	r = init_srcu_struct(&md->io_barrier);
+	if (r < 0)
+		goto bad_io_barrier;
+
 	md->type = DM_TYPE_NONE;
-	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
-	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
 	atomic_set(&md->event_nr, 0);
@@ -1937,6 +1976,8 @@ bad_thread:
 bad_disk:
 	blk_cleanup_queue(md->queue);
 bad_queue:
+	cleanup_srcu_struct(&md->io_barrier);
+bad_io_barrier:
 	free_minor(minor);
 bad_minor:
 	module_put(THIS_MODULE);
@@ -1960,6 +2001,7 @@ static void free_dev(struct mapped_device *md)
 		bioset_free(md->bs);
 	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
+	cleanup_srcu_struct(&md->io_barrier);
 	free_minor(minor);
 
 	spin_lock(&_minor_lock);
@@ -2102,7 +2144,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	struct dm_table *old_map;
 	struct request_queue *q = md->queue;
 	sector_t size;
-	unsigned long flags;
 	int merge_is_optional;
 
 	size = dm_table_get_size(t);
@@ -2131,9 +2172,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	merge_is_optional = dm_table_merge_is_optional(t);
 
-	write_lock_irqsave(&md->map_lock, flags);
 	old_map = md->map;
-	md->map = t;
+	rcu_assign_pointer(md->map, t);
 	md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
 	dm_table_set_restrictions(t, q, limits);
@@ -2141,7 +2181,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	else
 		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
-	write_unlock_irqrestore(&md->map_lock, flags);
+	dm_sync_table(md);
 
 	return old_map;
 }
@@ -2152,15 +2192,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 static struct dm_table *__unbind(struct mapped_device *md)
 {
 	struct dm_table *map = md->map;
-	unsigned long flags;
 
 	if (!map)
 		return NULL;
 
 	dm_table_event_callback(map, NULL, NULL);
-	write_lock_irqsave(&md->map_lock, flags);
-	md->map = NULL;
-	write_unlock_irqrestore(&md->map_lock, flags);
+	rcu_assign_pointer(md->map, NULL);
+	dm_sync_table(md);
 
 	return map;
 }
@@ -2312,11 +2350,12 @@ EXPORT_SYMBOL_GPL(dm_device_name);
 static void __dm_destroy(struct mapped_device *md, bool wait)
 {
 	struct dm_table *map;
+	int srcu_idx;
 
 	might_sleep();
 
 	spin_lock(&_minor_lock);
-	map = dm_get_live_table(md);
+	map = dm_get_live_table(md, &srcu_idx);
 	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
 	set_bit(DMF_FREEING, &md->flags);
 	spin_unlock(&_minor_lock);
@@ -2326,6 +2365,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		dm_table_postsuspend_targets(map);
 	}
 
+	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
+	dm_put_live_table(md, srcu_idx);
+
 	/*
 	 * Rare, but there may be I/O requests still going to complete,
 	 * for example.  Wait for all references to disappear.
@@ -2340,7 +2382,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		       dm_device_name(md), atomic_read(&md->holders));
 
 	dm_sysfs_exit(md);
-	dm_table_put(map);
 	dm_table_destroy(__unbind(md));
 	free_dev(md);
 }
@@ -2397,8 +2438,10 @@ static void dm_wq_work(struct work_struct *work)
 	struct mapped_device *md = container_of(work, struct mapped_device,
 						work);
 	struct bio *c;
+	int srcu_idx;
+	struct dm_table *map;
 
-	down_read(&md->io_lock);
+	map = dm_get_live_table(md, &srcu_idx);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
@@ -2408,17 +2451,13 @@ static void dm_wq_work(struct work_struct *work)
 		if (!c)
 			break;
 
-		up_read(&md->io_lock);
-
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else
-			__split_and_process_bio(md, c);
-
-		down_read(&md->io_lock);
+			__split_and_process_bio(md, map, c);
 	}
 
-	up_read(&md->io_lock);
+	dm_put_live_table(md, srcu_idx);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
@@ -2450,10 +2489,10 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	 * reappear.
 	 */
 	if (dm_table_has_no_data_devices(table)) {
-		live_map = dm_get_live_table(md);
+		live_map = dm_get_live_table_fast(md);
 		if (live_map)
 			limits = md->queue->limits;
-		dm_table_put(live_map);
+		dm_put_live_table_fast(md);
 	}
 
 	if (!live_map) {
@@ -2533,7 +2572,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
-	map = dm_get_live_table(md);
+	map = md->map;
 
 	/*
 	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2554,7 +2593,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	if (!noflush && do_lockfs) {
 		r = lock_fs(md);
 		if (r)
-			goto out;
+			goto out_unlock;
 	}
 
 	/*
@@ -2569,9 +2608,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
 	 * flush_workqueue(md->wq).
 	 */
-	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/*
 	 * Stop md->queue before flushing md->wq in case request-based
@@ -2589,10 +2627,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 */
 	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
-	down_write(&md->io_lock);
 	if (noflush)
 		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/* were we interrupted ? */
 	if (r < 0) {
@@ -2602,7 +2639,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 			start_queue(md->queue);
 
 		unlock_fs(md);
-		goto out; /* pushback list is already flushed, so skip flush */
+		goto out_unlock; /* pushback list is already flushed, so skip flush */
 	}
 
 	/*
@@ -2615,9 +2652,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	dm_table_postsuspend_targets(map);
 
-out:
-	dm_table_put(map);
-
 out_unlock:
 	mutex_unlock(&md->suspend_lock);
 	return r;
@@ -2632,7 +2666,7 @@ int dm_resume(struct mapped_device *md)
 	if (!dm_suspended_md(md))
 		goto out;
 
-	map = dm_get_live_table(md);
+	map = md->map;
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -2656,7 +2690,6 @@ int dm_resume(struct mapped_device *md)
 
 	r = 0;
 out:
-	dm_table_put(map);
 	mutex_unlock(&md->suspend_lock);
 
 	return r;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 681d109..dddc87b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -521,6 +521,7 @@ void mddev_init(struct mddev *mddev)
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
 	mddev->reshape_backwards = 0;
+	mddev->last_sync_action = "none";
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	mddev->level = LEVEL_NONE;
@@ -2867,7 +2868,7 @@ static ssize_t
 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	unsigned long long offset;
-	if (strict_strtoull(buf, 10, &offset) < 0)
+	if (kstrtoull(buf, 10, &offset) < 0)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
@@ -2895,7 +2896,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
 	unsigned long long new_offset;
 	struct mddev *mddev = rdev->mddev;
 
-	if (strict_strtoull(buf, 10, &new_offset) < 0)
+	if (kstrtoull(buf, 10, &new_offset) < 0)
 		return -EINVAL;
 
 	if (mddev->sync_thread)
@@ -2961,7 +2962,7 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
 	unsigned long long blocks;
 	sector_t new;
 
-	if (strict_strtoull(buf, 10, &blocks) < 0)
+	if (kstrtoull(buf, 10, &blocks) < 0)
 		return -EINVAL;
 
 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
@@ -3069,7 +3070,7 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
 
 	if (cmd_match(buf, "none"))
 		recovery_start = MaxSector;
-	else if (strict_strtoull(buf, 10, &recovery_start))
+	else if (kstrtoull(buf, 10, &recovery_start))
 		return -EINVAL;
 
 	if (rdev->mddev->pers &&
@@ -3497,7 +3498,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	if (clevel[len-1] == '\n')
 		len--;
 	clevel[len] = 0;
-	if (strict_strtol(clevel, 10, &level))
+	if (kstrtol(clevel, 10, &level))
 		level = LEVEL_NONE;
 
 	if (request_module("md-%s", clevel) != 0)
@@ -4272,6 +4273,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 	return len;
 }
 
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+static ssize_t
+last_sync_action_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%s\n", mddev->last_sync_action);
+}
+
+static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
+
 static ssize_t
 mismatch_cnt_show(struct mddev *mddev, char *page)
 {
@@ -4280,10 +4292,6 @@ mismatch_cnt_show(struct mddev *mddev, char *page)
 		       atomic64_read(&mddev->resync_mismatches));
 }
 
-static struct md_sysfs_entry md_scan_mode =
-__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
-
-
 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
 static ssize_t
@@ -4356,7 +4364,7 @@ sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	long n;
 
-	if (strict_strtol(buf, 10, &n))
+	if (kstrtol(buf, 10, &n))
 		return -EINVAL;
 
 	if (n != 0 && n != 1)
@@ -4424,7 +4432,7 @@ static ssize_t
 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long long min;
-	if (strict_strtoull(buf, 10, &min))
+	if (kstrtoull(buf, 10, &min))
 		return -EINVAL;
 	if (min > mddev->resync_max)
 		return -EINVAL;
@@ -4461,7 +4469,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->resync_max = MaxSector;
 	else {
 		unsigned long long max;
-		if (strict_strtoull(buf, 10, &max))
+		if (kstrtoull(buf, 10, &max))
 			return -EINVAL;
 		if (max < mddev->resync_min)
 			return -EINVAL;
@@ -4686,6 +4694,7 @@ static struct attribute *md_default_attrs[] = {
 
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
+	&md_last_scan_mode.attr,
 	&md_mismatches.attr,
 	&md_sync_min.attr,
 	&md_sync_max.attr,
@@ -5268,8 +5277,8 @@ static void md_clean(struct mddev *mddev)
 
 static void __md_stop_writes(struct mddev *mddev)
 {
+	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	if (mddev->sync_thread) {
-		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		md_reap_sync_thread(mddev);
 	}
@@ -6405,6 +6414,12 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		/* need to ensure md_delayed_delete() has completed */
 		flush_workqueue(md_misc_wq);
 
+	if (cmd == HOT_REMOVE_DISK)
+		/* need to ensure recovery thread has run */
+		wait_event_interruptible_timeout(mddev->sb_wait,
+						 !test_bit(MD_RECOVERY_NEEDED,
+							   &mddev->flags),
+						 msecs_to_jiffies(5000));
 	err = mddev_lock(mddev);
 	if (err) {
 		printk(KERN_INFO 
@@ -7323,7 +7338,7 @@ void md_do_sync(struct md_thread *thread)
 	sector_t last_check;
 	int skipped = 0;
 	struct md_rdev *rdev;
-	char *desc;
+	char *desc, *action = NULL;
 	struct blk_plug plug;
 
 	/* just incase thread restarts... */
@@ -7333,17 +7348,21 @@ void md_do_sync(struct md_thread *thread)
 		return;
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
 			desc = "data-check";
-		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			action = "check";
+		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 			desc = "requested-resync";
-		else
+			action = "repair";
+		} else
 			desc = "resync";
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		desc = "reshape";
 	else
 		desc = "recovery";
 
+	mddev->last_sync_action = action ?: desc;
+
 	/* we overload curr_resync somewhat here.
 	 * 0 == not engaged in resync at all
 	 * 2 == checking that there is no conflict with another sync
@@ -7892,6 +7911,8 @@ void md_check_recovery(struct mddev *mddev)
 			md_new_event(mddev);
 		}
 	unlock:
+		wake_up(&mddev->sb_wait);
+
 		if (!mddev->sync_thread) {
 			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 653f992b6..20f02c0 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -268,6 +268,14 @@ struct mddev {
 
 	struct md_thread		*thread;	/* management thread */
 	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
+
+	/* 'last_sync_action' is initialized to "none".  It is set when a
+	 * sync operation (i.e "data-check", "requested-resync", "resync",
+	 * "recovery", or "reshape") is started.  It holds this value even
+	 * when the sync thread is "frozen" (interrupted) or "idle" (stopped
+	 * or finished).  It is overwritten when a new sync operation is begun.
+	 */
+	char				*last_sync_action;
 	sector_t			curr_resync;	/* last block scheduled */
 	/* As resync requests can complete out of order, we cannot easily track
 	 * how much resync has been completed.  So we occasionally pause until
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index fcf65e5..c4d420b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -597,6 +597,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 			       mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
+		rdev->sectors = mddev->dev_sectors;
 	}
 
 	/* Set new parameters */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5595118..ec73458 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -417,7 +417,17 @@ static void raid1_end_write_request(struct bio *bio, int error)
 
 		r1_bio->bios[mirror] = NULL;
 		to_put = bio;
-		set_bit(R1BIO_Uptodate, &r1_bio->state);
+		/*
+		 * Do not set R1BIO_Uptodate if the current device is
+		 * rebuilding or Faulty. This is because we cannot use
+		 * such device for properly reading the data back (we could
+		 * potentially use it, if the current write would have felt
+		 * before rdev->recovery_offset, but for simplicity we don't
+		 * check this here.
+		 */
+		if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
+		    !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
+			set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
 		if (is_badblock(conf->mirrors[mirror].rdev,
@@ -870,17 +880,17 @@ static void allow_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void freeze_array(struct r1conf *conf)
+static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
 	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+1
+	 * wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
 	 * pending IO requests to complete or be queued for re-try.
-	 * Thus the number queued (nr_queued) plus this request (1)
+	 * Thus the number queued (nr_queued) plus this request (extra)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
@@ -888,7 +898,7 @@ static void freeze_array(struct r1conf *conf)
 	conf->barrier++;
 	conf->nr_waiting++;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
-				conf->nr_pending == conf->nr_queued+1,
+				conf->nr_pending == conf->nr_queued+extra,
 				conf->resync_lock,
 				flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
@@ -1509,8 +1519,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		p = conf->mirrors+mirror;
 		if (!p->rdev) {
 
-			disk_stack_limits(mddev->gendisk, rdev->bdev,
-					  rdev->data_offset << 9);
+			if (mddev->gendisk)
+				disk_stack_limits(mddev->gendisk, rdev->bdev,
+						  rdev->data_offset << 9);
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -1544,12 +1555,12 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * we wait for all outstanding requests to complete.
 		 */
 		synchronize_sched();
-		raise_barrier(conf);
-		lower_barrier(conf);
+		freeze_array(conf, 0);
+		unfreeze_array(conf);
 		clear_bit(Unmerged, &rdev->flags);
 	}
 	md_integrity_add_rdev(rdev, mddev);
-	if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 	print_conf(conf);
 	return err;
@@ -1595,11 +1606,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			 */
 			struct md_rdev *repl =
 				conf->mirrors[conf->raid_disks + number].rdev;
-			raise_barrier(conf);
+			freeze_array(conf, 0);
 			clear_bit(Replacement, &repl->flags);
 			p->rdev = repl;
 			conf->mirrors[conf->raid_disks + number].rdev = NULL;
-			lower_barrier(conf);
+			unfreeze_array(conf);
 			clear_bit(WantReplacement, &rdev->flags);
 		} else
 			clear_bit(WantReplacement, &rdev->flags);
@@ -2195,7 +2206,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 * frozen
 	 */
 	if (mddev->ro == 0) {
-		freeze_array(conf);
+		freeze_array(conf, 1);
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
 		unfreeze_array(conf);
@@ -2780,8 +2791,8 @@ static int run(struct mddev *mddev)
 		return PTR_ERR(conf);
 
 	if (mddev->queue)
-		blk_queue_max_write_same_sectors(mddev->queue,
-						 mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue, 0);
+
 	rdev_for_each(rdev, mddev) {
 		if (!mddev->gendisk)
 			continue;
@@ -2963,7 +2974,7 @@ static int raid1_reshape(struct mddev *mddev)
 		return -ENOMEM;
 	}
 
-	raise_barrier(conf);
+	freeze_array(conf, 0);
 
 	/* ok, everything is stopped */
 	oldpool = conf->r1bio_pool;
@@ -2994,7 +3005,7 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	lower_barrier(conf);
+	unfreeze_array(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 59d4daa..cd066b6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -97,7 +97,7 @@ static int max_queued_requests = 1024;
 
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
-static int enough(struct r10conf *conf, int ignore);
+static int _enough(struct r10conf *conf, int previous, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 				int *skipped);
 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
@@ -392,11 +392,9 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 * than fail the last device.  Here we redefine
 		 * "uptodate" to mean "Don't want to retry"
 		 */
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!enough(conf, rdev->raid_disk))
+		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
+			     rdev->raid_disk))
 			uptodate = 1;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	if (uptodate) {
 		raid_end_bio_io(r10_bio);
@@ -490,7 +488,17 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		sector_t first_bad;
 		int bad_sectors;
 
-		set_bit(R10BIO_Uptodate, &r10_bio->state);
+		/*
+		 * Do not set R10BIO_Uptodate if the current device is
+		 * rebuilding or Faulty. This is because we cannot use
+		 * such device for properly reading the data back (we could
+		 * potentially use it, if the current write would have felt
+		 * before rdev->recovery_offset, but for simplicity we don't
+		 * check this here.
+		 */
+		if (test_bit(In_sync, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags))
+			set_bit(R10BIO_Uptodate, &r10_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
 		if (is_badblock(rdev,
@@ -1055,17 +1063,17 @@ static void allow_barrier(struct r10conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void freeze_array(struct r10conf *conf)
+static void freeze_array(struct r10conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quiet.
 	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+1
+	 * wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
 	 * pending IO requests to complete or be queued for re-try.
-	 * Thus the number queued (nr_queued) plus this request (1)
+	 * Thus the number queued (nr_queued) plus this request (extra)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
@@ -1073,7 +1081,7 @@ static void freeze_array(struct r10conf *conf)
 	conf->barrier++;
 	conf->nr_waiting++;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
-				conf->nr_pending == conf->nr_queued+1,
+				conf->nr_pending == conf->nr_queued+extra,
 				conf->resync_lock,
 				flush_pending_writes(conf));
 
@@ -1622,37 +1630,58 @@ static void status(struct seq_file *seq, struct mddev *mddev)
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
-static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
+static int _enough(struct r10conf *conf, int previous, int ignore)
 {
 	int first = 0;
+	int has_enough = 0;
+	int disks, ncopies;
+	if (previous) {
+		disks = conf->prev.raid_disks;
+		ncopies = conf->prev.near_copies;
+	} else {
+		disks = conf->geo.raid_disks;
+		ncopies = conf->geo.near_copies;
+	}
 
+	rcu_read_lock();
 	do {
 		int n = conf->copies;
 		int cnt = 0;
 		int this = first;
 		while (n--) {
-			if (conf->mirrors[this].rdev &&
-			    this != ignore)
+			struct md_rdev *rdev;
+			if (this != ignore &&
+			    (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+			    test_bit(In_sync, &rdev->flags))
 				cnt++;
-			this = (this+1) % geo->raid_disks;
+			this = (this+1) % disks;
 		}
 		if (cnt == 0)
-			return 0;
-		first = (first + geo->near_copies) % geo->raid_disks;
+			goto out;
+		first = (first + ncopies) % disks;
 	} while (first != 0);
-	return 1;
+	has_enough = 1;
+out:
+	rcu_read_unlock();
+	return has_enough;
 }
 
 static int enough(struct r10conf *conf, int ignore)
 {
-	return _enough(conf, &conf->geo, ignore) &&
-		_enough(conf, &conf->prev, ignore);
+	/* when calling 'enough', both 'prev' and 'geo' must
+	 * be stable.
+	 * This is ensured if ->reconfig_mutex or ->device_lock
+	 * is held.
+	 */
+	return _enough(conf, 0, ignore) &&
+		_enough(conf, 1, ignore);
 }
 
 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r10conf *conf = mddev->private;
+	unsigned long flags;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1660,18 +1689,18 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	 * next level up know.
 	 * else mark the drive as failed
 	 */
+	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_bit(In_sync, &rdev->flags)
-	    && !enough(conf, rdev->raid_disk))
+	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
 		 */
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		return;
+	}
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		mddev->degraded++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
+			/*
 		 * if recovery is running, make sure it aborts.
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -1679,6 +1708,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
@@ -1781,7 +1811,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * very different from resync
 		 */
 		return -EBUSY;
-	if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
+	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
 		return -EINVAL;
 
 	if (rdev->raid_disk >= 0)
@@ -1809,15 +1839,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(Replacement, &rdev->flags);
 			rdev->raid_disk = mirror;
 			err = 0;
-			disk_stack_limits(mddev->gendisk, rdev->bdev,
-					  rdev->data_offset << 9);
+			if (mddev->gendisk)
+				disk_stack_limits(mddev->gendisk, rdev->bdev,
+						  rdev->data_offset << 9);
 			conf->fullsync = 1;
 			rcu_assign_pointer(p->replacement, rdev);
 			break;
 		}
 
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1837,8 +1869,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * we wait for all outstanding requests to complete.
 		 */
 		synchronize_sched();
-		raise_barrier(conf, 0);
-		lower_barrier(conf);
+		freeze_array(conf, 0);
+		unfreeze_array(conf);
 		clear_bit(Unmerged, &rdev->flags);
 	}
 	md_integrity_add_rdev(rdev, mddev);
@@ -2612,7 +2644,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	r10_bio->devs[slot].bio = NULL;
 
 	if (mddev->ro == 0) {
-		freeze_array(conf);
+		freeze_array(conf, 1);
 		fix_read_error(conf, mddev, r10_bio);
 		unfreeze_array(conf);
 	} else
@@ -2899,14 +2931,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 	 */
 	if (mddev->bitmap == NULL &&
 	    mddev->recovery_cp == MaxSector &&
+	    mddev->reshape_position == MaxSector &&
+	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    conf->fullsync == 0) {
 		*skipped = 1;
-		max_sector = mddev->dev_sectors;
-		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
-		    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-			max_sector = mddev->resync_max_sectors;
-		return max_sector - sector_nr;
+		return mddev->dev_sectors - sector_nr;
 	}
 
  skipped:
@@ -3522,7 +3553,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
 	/* FIXME calc properly */
 	conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
-							    max(0,mddev->delta_disks)),
+							    max(0,-mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out;
@@ -3609,8 +3640,7 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
-		blk_queue_max_write_same_sectors(mddev->queue,
-						 mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
@@ -3682,7 +3712,7 @@ static int run(struct mddev *mddev)
 		    conf->geo.far_offset == 0)
 			goto out_free_conf;
 		if (conf->prev.far_copies != 1 &&
-		    conf->geo.far_offset == 0)
+		    conf->prev.far_offset == 0)
 			goto out_free_conf;
 	}
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9359828..2bf094a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -664,6 +664,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 				bi->bi_rw |= REQ_FLUSH;
 
+			bi->bi_vcnt = 1;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
@@ -701,6 +702,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			else
 				rbi->bi_sector = (sh->sector
 						  + rrdev->data_offset);
+			rbi->bi_vcnt = 1;
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
 			rbi->bi_size = STRIPE_SIZE;
@@ -4922,7 +4924,7 @@ raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
 	if (!conf)
 		return -ENODEV;
 
-	if (strict_strtoul(page, 10, &new))
+	if (kstrtoul(page, 10, &new))
 		return -EINVAL;
 	err = raid5_set_cache_size(mddev, new);
 	if (err)
@@ -4955,7 +4957,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
 	if (!conf)
 		return -ENODEV;
 
-	if (strict_strtoul(page, 10, &new))
+	if (kstrtoul(page, 10, &new))
 		return -EINVAL;
 	if (new > conf->max_nr_stripes)
 		return -EINVAL;
@@ -5464,7 +5466,7 @@ static int run(struct mddev *mddev)
 		if (mddev->major_version == 0 &&
 		    mddev->minor_version > 90)
 			rdev->recovery_offset = reshape_offset;
-			
+
 		if (rdev->recovery_offset < reshape_offset) {
 			/* We need to check old and new layout */
 			if (!only_parity(rdev->raid_disk,
@@ -5587,6 +5589,8 @@ static int run(struct mddev *mddev)
 		 */
 		mddev->queue->limits.discard_zeroes_data = 0;
 
+		blk_queue_max_write_same_sectors(mddev->queue, 0);
+
 		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
@@ -5910,7 +5914,7 @@ static int check_reshape(struct mddev *mddev)
 		return 0; /* nothing to do */
 	if (has_failed(conf))
 		return -EINVAL;
-	if (mddev->delta_disks < 0) {
+	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
 		/* We might be able to shrink, but the devices must
 		 * be made bigger first.
 		 * For raid6, 4 is the minimum size.