From 9092c02d943515b3c9ffd5d0003527f8cc1dd77b Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 2 May 2013 14:19:24 -0500
Subject: DM RAID: Add ability to restore transiently failed devices on resume

DM RAID: Add ability to restore transiently failed devices on resume

This patch adds code to the resume function to check over the devices
in the RAID array.  If any are found to be marked as failed and their
superblocks can be read, an attempt is made to reintegrate them into
the array.  This allows the user to refresh the array with a simple
suspend and resume of the array - rather than having to load a
completely new table, allocate and initialize all the structures and
throw away the old instantiation.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index e919228..2bb3a68 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -222,3 +222,4 @@ Version History
 1.4.2   Add RAID10 "far" and "offset" algorithm support.
 1.5.0   Add message interface to allow manipulation of the sync_action.
 	New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
+1.5.1   Add ability to restore transiently failed devices on resume.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1d3fe1a..facaf91 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1574,12 +1574,54 @@ static void raid_postsuspend(struct dm_target *ti)
 
 static void raid_resume(struct dm_target *ti)
 {
+	int i;
+	uint64_t failed_devices, cleared_failed_devices = 0;
+	unsigned long flags;
+	struct dm_raid_superblock *sb;
 	struct raid_set *rs = ti->private;
+	struct md_rdev *r;
 
 	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
 	if (!rs->bitmap_loaded) {
 		bitmap_load(&rs->md);
 		rs->bitmap_loaded = 1;
+	} else {
+		/*
+		 * A secondary resume while the device is active.
+		 * Take this opportunity to check whether any failed
+		 * devices are reachable again.
+		 */
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			r = &rs->dev[i].rdev;
+			if (test_bit(Faulty, &r->flags) && r->sb_page &&
+			    sync_page_io(r, 0, r->sb_size,
+					 r->sb_page, READ, 1)) {
+				DMINFO("Faulty device #%d has readable super"
+				       "block.  Attempting to revive it.", i);
+				r->raid_disk = i;
+				r->saved_raid_disk = i;
+				flags = r->flags;
+				clear_bit(Faulty, &r->flags);
+				clear_bit(WriteErrorSeen, &r->flags);
+				clear_bit(In_sync, &r->flags);
+				if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+					r->raid_disk = -1;
+					r->saved_raid_disk = -1;
+					r->flags = flags;
+				} else {
+					r->recovery_offset = 0;
+					cleared_failed_devices |= 1 << i;
+				}
+			}
+		}
+		if (cleared_failed_devices) {
+			rdev_for_each(r, &rs->md) {
+				sb = page_address(r->sb_page);
+				failed_devices = le64_to_cpu(sb->failed_devices);
+				failed_devices &= ~cleared_failed_devices;
+				sb->failed_devices = cpu_to_le64(failed_devices);
+			}
+		}
 	}
 
 	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
@@ -1588,7 +1630,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 5, 0},
+	.version = {1, 5, 1},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6e17f81..ec73458 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1519,8 +1519,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		p = conf->mirrors+mirror;
 		if (!p->rdev) {
 
-			disk_stack_limits(mddev->gendisk, rdev->bdev,
-					  rdev->data_offset << 9);
+			if (mddev->gendisk)
+				disk_stack_limits(mddev->gendisk, rdev->bdev,
+						  rdev->data_offset << 9);
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -1559,7 +1560,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		clear_bit(Unmerged, &rdev->flags);
 	}
 	md_integrity_add_rdev(rdev, mddev);
-	if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 	print_conf(conf);
 	return err;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6ddae25..3c6b193 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1819,15 +1819,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(Replacement, &rdev->flags);
 			rdev->raid_disk = mirror;
 			err = 0;
-			disk_stack_limits(mddev->gendisk, rdev->bdev,
-					  rdev->data_offset << 9);
+			if (mddev->gendisk)
+				disk_stack_limits(mddev->gendisk, rdev->bdev,
+						  rdev->data_offset << 9);
 			conf->fullsync = 1;
 			rcu_assign_pointer(p->replacement, rdev);
 			break;
 		}
 
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
-- 
cgit v0.10.2


From f381e71b042af910fbe5f8222792cc5092750993 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Wed, 8 May 2013 17:57:13 -0500
Subject: DM RAID: Break-up untidy function

DM RAID:  Break-up untidy function

Clean-up excessive indentation by moving some code in raid_resume()
into its own function.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index facaf91..59d15ec 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1572,15 +1572,51 @@ static void raid_postsuspend(struct dm_target *ti)
 	mddev_suspend(&rs->md);
 }
 
-static void raid_resume(struct dm_target *ti)
+static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 {
 	int i;
 	uint64_t failed_devices, cleared_failed_devices = 0;
 	unsigned long flags;
 	struct dm_raid_superblock *sb;
-	struct raid_set *rs = ti->private;
 	struct md_rdev *r;
 
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		r = &rs->dev[i].rdev;
+		if (test_bit(Faulty, &r->flags) && r->sb_page &&
+		    sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) {
+			DMINFO("Faulty %s device #%d has readable super block."
+			       "  Attempting to revive it.",
+			       rs->raid_type->name, i);
+			r->raid_disk = i;
+			r->saved_raid_disk = i;
+			flags = r->flags;
+			clear_bit(Faulty, &r->flags);
+			clear_bit(WriteErrorSeen, &r->flags);
+			clear_bit(In_sync, &r->flags);
+			if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+				r->raid_disk = -1;
+				r->saved_raid_disk = -1;
+				r->flags = flags;
+			} else {
+				r->recovery_offset = 0;
+				cleared_failed_devices |= 1 << i;
+			}
+		}
+	}
+	if (cleared_failed_devices) {
+		rdev_for_each(r, &rs->md) {
+			sb = page_address(r->sb_page);
+			failed_devices = le64_to_cpu(sb->failed_devices);
+			failed_devices &= ~cleared_failed_devices;
+			sb->failed_devices = cpu_to_le64(failed_devices);
+		}
+	}
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
 	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
 	if (!rs->bitmap_loaded) {
 		bitmap_load(&rs->md);
@@ -1591,37 +1627,7 @@ static void raid_resume(struct dm_target *ti)
 		 * Take this opportunity to check whether any failed
 		 * devices are reachable again.
 		 */
-		for (i = 0; i < rs->md.raid_disks; i++) {
-			r = &rs->dev[i].rdev;
-			if (test_bit(Faulty, &r->flags) && r->sb_page &&
-			    sync_page_io(r, 0, r->sb_size,
-					 r->sb_page, READ, 1)) {
-				DMINFO("Faulty device #%d has readable super"
-				       "block.  Attempting to revive it.", i);
-				r->raid_disk = i;
-				r->saved_raid_disk = i;
-				flags = r->flags;
-				clear_bit(Faulty, &r->flags);
-				clear_bit(WriteErrorSeen, &r->flags);
-				clear_bit(In_sync, &r->flags);
-				if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
-					r->raid_disk = -1;
-					r->saved_raid_disk = -1;
-					r->flags = flags;
-				} else {
-					r->recovery_offset = 0;
-					cleared_failed_devices |= 1 << i;
-				}
-			}
-		}
-		if (cleared_failed_devices) {
-			rdev_for_each(r, &rs->md) {
-				sb = page_address(r->sb_page);
-				failed_devices = le64_to_cpu(sb->failed_devices);
-				failed_devices &= ~cleared_failed_devices;
-				sb->failed_devices = cpu_to_le64(failed_devices);
-			}
-		}
+		attempt_restore_of_faulty_devices(rs);
 	}
 
 	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
-- 
cgit v0.10.2


From a4dc163a55964d683f92742705c90c78c0f56c0c Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Wed, 8 May 2013 18:00:54 -0500
Subject: DM RAID: Fix raid_resume not reviving failed devices in all cases

DM RAID:  Fix raid_resume not reviving failed devices in all cases

When a device fails in a RAID array, it is marked as Faulty.  Later,
md_check_recovery is called which (through the call chain) calls
'hot_remove_disk' in order to have the personalities remove the device
from use in the array.

Sometimes, it is possible for the array to be suspended before the
personalities get their chance to perform 'hot_remove_disk'.  This is
normally not an issue.  If the array is deactivated, then the failed
device will be noticed when the array is reinstantiated.  If the
array is resumed and the disk is still missing, md_check_recovery will
be called upon resume and 'hot_remove_disk' will be called at that
time.  However, (for dm-raid) if the device has been restored,
a resume on the array would cause it to attempt to revive the device
by calling 'hot_add_disk'.  If 'hot_remove_disk' had not been called,
a situation is then created where the device is thought to concurrently
be the replacement and the device to be replaced.  Thus, the device
is first sync'ed with the rest of the array (because it is the replacement
device) and then marked Faulty and removed from the array (because
it is also the device being replaced).

The solution is to check and see if the device had properly been removed
before the array was suspended.  This is done by seeing whether the
device's 'raid_disk' field is -1 - a condition that implies that
'md_check_recovery -> remove_and_add_spares (where raid_disk is set to -1)
-> hot_remove_disk' has been called.  If 'raid_disk' is not -1, then
'hot_remove_disk' must be called to complete the removal of the previously
faulty device before it can be revived via 'hot_add_disk'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 59d15ec..49f0bd5 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1587,6 +1587,21 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 			DMINFO("Faulty %s device #%d has readable super block."
 			       "  Attempting to revive it.",
 			       rs->raid_type->name, i);
+
+			/*
+			 * Faulty bit may be set, but sometimes the array can
+			 * be suspended before the personalities can respond
+			 * by removing the device from the array (i.e. calling
+			 * 'hot_remove_disk').  If they haven't yet removed
+			 * the failed device, its 'raid_disk' number will be
+			 * '>= 0' - meaning we must call this function
+			 * ourselves.
+			 */
+			if ((r->raid_disk >= 0) &&
+			    (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
+				/* Failed to revive this device, try next */
+				continue;
+
 			r->raid_disk = i;
 			r->saved_raid_disk = i;
 			flags = r->flags;
-- 
cgit v0.10.2


From 3f6bbd3ffd7b733dd705e494663e5761aa2cb9c1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 9 May 2013 10:27:49 +1000
Subject: dm-raid: silence compiler warning on rebuilds_per_group.

This doesn't really need to be initialised, but it doesn't hurt,
silences the compiler, and as it is a counter it makes sense for it to
start at zero.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 49f0bd5..0f5a6fb 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -380,7 +380,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned rebuilds_per_group, copies, d;
+	unsigned rebuilds_per_group = 0, copies, d;
 	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
-- 
cgit v0.10.2


From 90f5f7ad4f38d67626b0c220a0ac390603e0bded Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 2 Apr 2013 08:38:55 +0200
Subject: md: Wait for md_check_recovery before attempting device removal.

When a device has failed, it needs to be removed from the personality
module before it can be removed from the array as a whole.
The first step is performed by md_check_recovery() which is called
from the raid management thread.

So when a HOT_REMOVE ioctl arrives, wait briefly for md_check_recovery
to have run.  This increases the chance that the ioctl will succeed.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Neil Brown <nfbrown@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9b82377..3b25b72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6405,6 +6405,12 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		/* need to ensure md_delayed_delete() has completed */
 		flush_workqueue(md_misc_wq);
 
+	if (cmd == HOT_REMOVE_DISK)
+		/* need to ensure recovery thread has run */
+		wait_event_interruptible_timeout(mddev->sb_wait,
+						 !test_bit(MD_RECOVERY_NEEDED,
+							   &mddev->flags),
+						 msecs_to_jiffies(5000));
 	err = mddev_lock(mddev);
 	if (err) {
 		printk(KERN_INFO 
@@ -7892,6 +7898,8 @@ void md_check_recovery(struct mddev *mddev)
 			md_new_event(mddev);
 		}
 	unlock:
+		wake_up(&mddev->sb_wait);
+
 		if (!mddev->sync_thread) {
 			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
-- 
cgit v0.10.2


From b29bebd66dbd492105668ec3515a5ffb0b25e4c1 Mon Sep 17 00:00:00 2001
From: Jingoo Han <jg1.han@samsung.com>
Date: Sat, 1 Jun 2013 16:15:16 +0900
Subject: md: replace strict_strto*() with kstrto*()

The usage of strict_strtoul() is not preferred, because
strict_strtoul() is obsolete. Thus, kstrtoul() should be
used.

Signed-off-by: Jingoo Han <jg1.han@samsung.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a2c754..a7fd821 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -2002,9 +2002,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		} else {
 			int rv;
 			if (buf[0] == '+')
-				rv = strict_strtoll(buf+1, 10, &offset);
+				rv = kstrtoll(buf+1, 10, &offset);
 			else
-				rv = strict_strtoll(buf, 10, &offset);
+				rv = kstrtoll(buf, 10, &offset);
 			if (rv)
 				return rv;
 			if (offset == 0)
@@ -2139,7 +2139,7 @@ static ssize_t
 backlog_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long backlog;
-	int rv = strict_strtoul(buf, 10, &backlog);
+	int rv = kstrtoul(buf, 10, &backlog);
 	if (rv)
 		return rv;
 	if (backlog > COUNTER_MAX)
@@ -2165,7 +2165,7 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
 	unsigned long csize;
 	if (mddev->bitmap)
 		return -EBUSY;
-	rv = strict_strtoul(buf, 10, &csize);
+	rv = kstrtoul(buf, 10, &csize);
 	if (rv)
 		return rv;
 	if (csize < 512 ||
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 0f5a6fb..21e8e46 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -504,7 +504,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	 * First, parse the in-order required arguments
 	 * "chunk_size" is the only argument of this type.
 	 */
-	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
+	if ((kstrtoul(argv[0], 10, &value) < 0)) {
 		rs->ti->error = "Bad chunk size";
 		return -EINVAL;
 	} else if (rs->raid_type->level == 1) {
@@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			continue;
 		}
 
-		if (strict_strtoul(argv[i], 10, &value) < 0) {
+		if (kstrtoul(argv[i], 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
@@ -1181,7 +1181,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	argv++;
 
 	/* number of RAID parameters */
-	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
+	if (kstrtoul(argv[0], 10, &num_raid_params) < 0) {
 		ti->error = "Cannot understand number of RAID parameters";
 		return -EINVAL;
 	}
@@ -1194,7 +1194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 	}
 
-	if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
 	    (num_raid_devs >= INT_MAX)) {
 		ti->error = "Cannot understand number of raid devices";
 		return -EINVAL;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3b25b72..26f9452 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2867,7 +2867,7 @@ static ssize_t
 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	unsigned long long offset;
-	if (strict_strtoull(buf, 10, &offset) < 0)
+	if (kstrtoull(buf, 10, &offset) < 0)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
@@ -2895,7 +2895,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
 	unsigned long long new_offset;
 	struct mddev *mddev = rdev->mddev;
 
-	if (strict_strtoull(buf, 10, &new_offset) < 0)
+	if (kstrtoull(buf, 10, &new_offset) < 0)
 		return -EINVAL;
 
 	if (mddev->sync_thread)
@@ -2961,7 +2961,7 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
 	unsigned long long blocks;
 	sector_t new;
 
-	if (strict_strtoull(buf, 10, &blocks) < 0)
+	if (kstrtoull(buf, 10, &blocks) < 0)
 		return -EINVAL;
 
 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
@@ -3069,7 +3069,7 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
 
 	if (cmd_match(buf, "none"))
 		recovery_start = MaxSector;
-	else if (strict_strtoull(buf, 10, &recovery_start))
+	else if (kstrtoull(buf, 10, &recovery_start))
 		return -EINVAL;
 
 	if (rdev->mddev->pers &&
@@ -3497,7 +3497,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	if (clevel[len-1] == '\n')
 		len--;
 	clevel[len] = 0;
-	if (strict_strtol(clevel, 10, &level))
+	if (kstrtol(clevel, 10, &level))
 		level = LEVEL_NONE;
 
 	if (request_module("md-%s", clevel) != 0)
@@ -4356,7 +4356,7 @@ sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	long n;
 
-	if (strict_strtol(buf, 10, &n))
+	if (kstrtol(buf, 10, &n))
 		return -EINVAL;
 
 	if (n != 0 && n != 1)
@@ -4424,7 +4424,7 @@ static ssize_t
 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long long min;
-	if (strict_strtoull(buf, 10, &min))
+	if (kstrtoull(buf, 10, &min))
 		return -EINVAL;
 	if (min > mddev->resync_max)
 		return -EINVAL;
@@ -4461,7 +4461,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->resync_max = MaxSector;
 	else {
 		unsigned long long max;
-		if (strict_strtoull(buf, 10, &max))
+		if (kstrtoull(buf, 10, &max))
 			return -EINVAL;
 		if (max < mddev->resync_min)
 			return -EINVAL;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 05e4a10..cd9aab9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4924,7 +4924,7 @@ raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
 	if (!conf)
 		return -ENODEV;
 
-	if (strict_strtoul(page, 10, &new))
+	if (kstrtoul(page, 10, &new))
 		return -EINVAL;
 	err = raid5_set_cache_size(mddev, new);
 	if (err)
@@ -4957,7 +4957,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
 	if (!conf)
 		return -ENODEV;
 
-	if (strict_strtoul(page, 10, &new))
+	if (kstrtoul(page, 10, &new))
 		return -EINVAL;
 	if (new > conf->max_nr_stripes)
 		return -EINVAL;
-- 
cgit v0.10.2


From 635f6416a2e983adac8ccf90bffbeed0c1a76454 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 11 Jun 2013 14:57:09 +1000
Subject: md/raid10: locking changes for 'enough()'.

As 'enough' accesses conf->prev and conf->geo, which can change
spontanously, it should guard against changes.
This can be done with device_lock as start_reshape holds device_lock
while updating 'geo' and end_reshape holds it while updating 'prev'.

So 'error' needs to hold 'device_lock'.

On the other hand, raid10_end_read_request knows which of the two it
really wants to access, and as it is an active request on that one,
the value cannot change underneath it.

So change _enough to take flag rather than a pointer, pass the
appropriate flag from raid10_end_read_request(), and remove the locking.

All other calls to 'enough' are made with reconfig_mutex held, so
neither 'prev' nor 'geo' can change.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c6b193..5169ed2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -97,7 +97,7 @@ static int max_queued_requests = 1024;
 
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
-static int enough(struct r10conf *conf, int ignore);
+static int _enough(struct r10conf *conf, int previous, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 				int *skipped);
 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
@@ -392,11 +392,9 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 * than fail the last device.  Here we redefine
 		 * "uptodate" to mean "Don't want to retry"
 		 */
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!enough(conf, rdev->raid_disk))
+		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
+			     rdev->raid_disk))
 			uptodate = 1;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	if (uptodate) {
 		raid_end_bio_io(r10_bio);
@@ -1632,9 +1630,17 @@ static void status(struct seq_file *seq, struct mddev *mddev)
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
-static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
+static int _enough(struct r10conf *conf, int previous, int ignore)
 {
 	int first = 0;
+	int disks, ncopies;
+	if (previous) {
+		disks = conf->prev.raid_disks;
+		ncopies = conf->prev.near_copies;
+	} else {
+		disks = conf->geo.raid_disks;
+		ncopies = conf->geo.near_copies;
+	}
 
 	do {
 		int n = conf->copies;
@@ -1644,25 +1650,31 @@ static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
 			if (conf->mirrors[this].rdev &&
 			    this != ignore)
 				cnt++;
-			this = (this+1) % geo->raid_disks;
+			this = (this+1) % disks;
 		}
 		if (cnt == 0)
 			return 0;
-		first = (first + geo->near_copies) % geo->raid_disks;
+		first = (first + ncopies) % disks;
 	} while (first != 0);
 	return 1;
 }
 
 static int enough(struct r10conf *conf, int ignore)
 {
-	return _enough(conf, &conf->geo, ignore) &&
-		_enough(conf, &conf->prev, ignore);
+	/* when calling 'enough', both 'prev' and 'geo' must
+	 * be stable.
+	 * This is ensured if ->reconfig_mutex or ->device_lock
+	 * is held.
+	 */
+	return _enough(conf, 0, ignore) &&
+		_enough(conf, 1, ignore);
 }
 
 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r10conf *conf = mddev->private;
+	unsigned long flags;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1670,18 +1682,18 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	 * next level up know.
 	 * else mark the drive as failed
 	 */
+	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_bit(In_sync, &rdev->flags)
-	    && !enough(conf, rdev->raid_disk))
+	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
 		 */
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		return;
+	}
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		mddev->degraded++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
+			/*
 		 * if recovery is running, make sure it aborts.
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -1689,6 +1701,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
@@ -1791,7 +1804,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * very different from resync
 		 */
 		return -EBUSY;
-	if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
+	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
 		return -EINVAL;
 
 	if (rdev->raid_disk >= 0)
-- 
cgit v0.10.2


From 725d6e579f06360744fc101b1af2f82d8aa282f1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 11 Jun 2013 15:08:03 +1000
Subject: md/raid10: check In_sync flag in 'enough()'.

It isn't really enough to check that the rdev is present, we need to
also be sure that the device is still In_sync.

Doing this requires using rcu_dereference to access the rdev, and
holding the rcu_read_lock() to ensure the rdev doesn't disappear while
we look at it.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5169ed2..aa8ba07 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1633,6 +1633,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 static int _enough(struct r10conf *conf, int previous, int ignore)
 {
 	int first = 0;
+	int has_enough = 0;
 	int disks, ncopies;
 	if (previous) {
 		disks = conf->prev.raid_disks;
@@ -1642,21 +1643,27 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
 		ncopies = conf->geo.near_copies;
 	}
 
+	rcu_read_lock();
 	do {
 		int n = conf->copies;
 		int cnt = 0;
 		int this = first;
 		while (n--) {
-			if (conf->mirrors[this].rdev &&
-			    this != ignore)
+			struct md_rdev *rdev;
+			if (this != ignore &&
+			    (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+			    test_bit(In_sync, &rdev->flags))
 				cnt++;
 			this = (this+1) % disks;
 		}
 		if (cnt == 0)
-			return 0;
+			goto out;
 		first = (first + ncopies) % disks;
 	} while (first != 0);
-	return 1;
+	has_enough = 1;
+out:
+	rcu_read_unlock();
+	return has_enough;
 }
 
 static int enough(struct r10conf *conf, int ignore)
-- 
cgit v0.10.2


From eea136d69f9facb2d3807386bbac1e6b1161795f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 26 Jun 2013 11:55:20 +1000
Subject: md: fix buglet in RAID5 -> RAID0 conversion.

RAID5 uses a 'per-array' value for the 'size' of each device.
RAID0 uses a 'per-device' value - it can be different for each device.

When converting a RAID5 to a RAID0 we must ensure that the per-device
size of each device matches the per-array size for the RAID5, else
the array will change size.

If the metadata cannot record a changed per-device size (as is the
case with v0.90 metadata) the array could get bigger on restart.  This
does not cause data corruption, so it not a big issue and is mainly
yet another a reason to not use 0.90.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index fcf65e5..c4d420b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -597,6 +597,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 			       mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
+		rdev->sectors = mddev->dev_sectors;
 	}
 
 	/* Set new parameters */
-- 
cgit v0.10.2


From c4a39551451666229b4ea5e8aae8ca0131d00665 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 25 Jun 2013 01:23:59 -0500
Subject: MD: Remember the last sync operation that was performed

MD:  Remember the last sync operation that was performed

This patch adds a field to the mddev structure to track the last
sync operation that was performed.  This is especially useful when
it comes to what is recorded in mismatch_cnt in sysfs.  If the
last operation was "data-check", then it reports the number of
descrepancies found by the user-initiated check.  If it was a
"repair" operation, then it is reporting the number of
descrepancies repaired.  etc.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 2bb3a68..ef8ba9f 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -223,3 +223,4 @@ Version History
 1.5.0   Add message interface to allow manipulation of the sync_action.
 	New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
 1.5.1   Add ability to restore transiently failed devices on resume.
+1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 21e8e46..4880b69 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1388,6 +1388,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		 *   performing a "check" of the array.
 		 */
 		DMEMIT(" %llu",
+		       (strcmp(rs->md.last_sync_action, "check")) ? 0 :
 		       (unsigned long long)
 		       atomic64_read(&rs->md.resync_mismatches));
 		break;
@@ -1651,7 +1652,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 5, 1},
+	.version = {1, 5, 2},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 26f9452..dddc87b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -521,6 +521,7 @@ void mddev_init(struct mddev *mddev)
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
 	mddev->reshape_backwards = 0;
+	mddev->last_sync_action = "none";
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	mddev->level = LEVEL_NONE;
@@ -4272,6 +4273,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 	return len;
 }
 
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+static ssize_t
+last_sync_action_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%s\n", mddev->last_sync_action);
+}
+
+static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
+
 static ssize_t
 mismatch_cnt_show(struct mddev *mddev, char *page)
 {
@@ -4280,10 +4292,6 @@ mismatch_cnt_show(struct mddev *mddev, char *page)
 		       atomic64_read(&mddev->resync_mismatches));
 }
 
-static struct md_sysfs_entry md_scan_mode =
-__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
-
-
 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
 static ssize_t
@@ -4686,6 +4694,7 @@ static struct attribute *md_default_attrs[] = {
 
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
+	&md_last_scan_mode.attr,
 	&md_mismatches.attr,
 	&md_sync_min.attr,
 	&md_sync_max.attr,
@@ -7329,7 +7338,7 @@ void md_do_sync(struct md_thread *thread)
 	sector_t last_check;
 	int skipped = 0;
 	struct md_rdev *rdev;
-	char *desc;
+	char *desc, *action = NULL;
 	struct blk_plug plug;
 
 	/* just incase thread restarts... */
@@ -7339,17 +7348,21 @@ void md_do_sync(struct md_thread *thread)
 		return;
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
 			desc = "data-check";
-		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			action = "check";
+		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 			desc = "requested-resync";
-		else
+			action = "repair";
+		} else
 			desc = "resync";
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		desc = "reshape";
 	else
 		desc = "recovery";
 
+	mddev->last_sync_action = action ?: desc;
+
 	/* we overload curr_resync somewhat here.
 	 * 0 == not engaged in resync at all
 	 * 2 == checking that there is no conflict with another sync
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 653f992b6..20f02c0 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -268,6 +268,14 @@ struct mddev {
 
 	struct md_thread		*thread;	/* management thread */
 	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
+
+	/* 'last_sync_action' is initialized to "none".  It is set when a
+	 * sync operation (i.e "data-check", "requested-resync", "resync",
+	 * "recovery", or "reshape") is started.  It holds this value even
+	 * when the sync thread is "frozen" (interrupted) or "idle" (stopped
+	 * or finished).  It is overwritten when a new sync operation is begun.
+	 */
+	char				*last_sync_action;
 	sector_t			curr_resync;	/* last block scheduled */
 	/* As resync requests can complete out of order, we cannot easily track
 	 * how much resync has been completed.  So we occasionally pause until
-- 
cgit v0.10.2


From 0baac4db56b1895743391a348ee9ce4ae5076f9f Mon Sep 17 00:00:00 2001
From: CoolCold <coolthecold@gmail.com>
Date: Wed, 26 Jun 2013 19:46:58 +0400
Subject: md: remove doubled description for sync_max, merging it within
 sync_min/sync_max

After last md.txt edits for sync_min/max, sync_max description became
doubled. Removing 1st copy, merging details into common
sync_min/sync_max section.

Signed-off-by: Roman Ovchinnikov <coolthecold@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/Documentation/md.txt b/Documentation/md.txt
index e0ddd32..fbb2fcb 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -566,13 +566,6 @@ also have
      when it reaches the current sync_max (below) and possibly at
      other times.
 
-   sync_max
-     This is a number of sectors at which point a resync/recovery
-     process will pause.  When a resync is active, the value can
-     only ever be increased, never decreased.  The value of 'max'
-     effectively disables the limit.
-
-
    sync_speed
      This shows the current actual speed, in K/sec, of the current
      sync_action.  It is averaged over the last 30 seconds.
@@ -593,6 +586,12 @@ also have
      that number to reach sync_max.  Then you can either increase
      "sync_max", or can write 'idle' to "sync_action".
 
+     The value of 'max' for "sync_max" effectively disables the limit.
+     When a resync is active, the value can only ever be increased,
+     never decreased.
+     The value of '0' is the minimum for "sync_min".
+
+
 
 Each active md device may also have attributes specific to the
 personality module that manages it.
-- 
cgit v0.10.2


From 78eaa0d4cbcdb345992fa3dd22b3bcbb473cc064 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 2 Jul 2013 15:58:05 +1000
Subject: md/raid10: fix two bugs affecting RAID10 reshape.

1/ If a RAID10 is being reshaped to a fewer number of devices
 and is stopped while this is ongoing, then when the array is
 reassembled the 'mirrors' array will be allocated too small.
 This will lead to an access error or memory corruption.

2/ A sanity test for a reshaping RAID10 array is restarted
 is slightly incorrect.

Due to the first bug, this is suitable for any -stable
kernel since 3.5 where this code was introduced.

Cc: stable@vger.kernel.org (v3.5+)
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index aa8ba07..3480bf7c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3554,7 +3554,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
 	/* FIXME calc properly */
 	conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
-							    max(0,mddev->delta_disks)),
+							    max(0,-mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out;
@@ -3713,7 +3713,7 @@ static int run(struct mddev *mddev)
 		    conf->geo.far_offset == 0)
 			goto out_free_conf;
 		if (conf->prev.far_copies != 1 &&
-		    conf->geo.far_offset == 0)
+		    conf->prev.far_offset == 0)
 			goto out_free_conf;
 	}
 
-- 
cgit v0.10.2


From fdcfbbb653b27964c4daa4d2bcb364259c257e7d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 4 Jul 2013 16:38:16 +1000
Subject: md/raid5: allow 5-device RAID6 to be reshaped to 4-device.

There is a bug in 'check_reshape' for raid5.c  To checks
that the new minimum number of devices is large enough (which is
good), but it does so also after the reshape has started (bad).

This is bad because
 - the calculation is now wrong as mddev->raid_disks has changed
   already, and
 - it is pointless because it is now too late to stop.

So only perform that test when reshape has not been committed to.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cd9aab9..2bf094a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5914,7 +5914,7 @@ static int check_reshape(struct mddev *mddev)
 		return 0; /* nothing to do */
 	if (has_failed(conf))
 		return -EINVAL;
-	if (mddev->delta_disks < 0) {
+	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
 		/* We might be able to shrink, but the devices must
 		 * be made bigger first.
 		 * For raid6, 4 is the minimum size.
-- 
cgit v0.10.2


From 1376512065b23f39d5f9a160948f313397dde972 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 4 Jul 2013 16:41:53 +1000
Subject: md/raid10: fix bug which causes all RAID10 reshapes to move no data.

The recent comment:
commit 7e83ccbecd608b971f340e951c9e84cd0343002f
    md/raid10: Allow skipping recovery when clean arrays are assembled

Causes raid10 to skip a recovery in certain cases where it is safe to
do so.  Unfortunately it also causes a reshape to be skipped which is
never safe.  The result is that an attempt to reshape a RAID10 will
appear to complete instantly, but no data will have been moves so the
array will now contain garbage.
(If nothing is written, you can recovery by simple performing the
reverse reshape which will also complete instantly).

Bug was introduced in 3.10, so this is suitable for 3.10-stable.

Cc: stable@vger.kernel.org (3.10)
Cc: Martin Wilck <mwilck@arcor.de>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3480bf7c..cd066b6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2931,14 +2931,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 	 */
 	if (mddev->bitmap == NULL &&
 	    mddev->recovery_cp == MaxSector &&
+	    mddev->reshape_position == MaxSector &&
+	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    conf->fullsync == 0) {
 		*skipped = 1;
-		max_sector = mddev->dev_sectors;
-		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
-		    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-			max_sector = mddev->resync_max_sectors;
-		return max_sector - sector_nr;
+		return mddev->dev_sectors - sector_nr;
 	}
 
  skipped:
-- 
cgit v0.10.2