From e2218350465e7e0931676b4849b594c978437bce Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 May 2010 08:25:37 +1000
Subject: md: set mddev readonly flag on blkdev BLKROSET ioctl

When the user sets the block device to readwrite then the mddev should
follow suit.  Otherwise, the BUG_ON in md_write_start() will be set to
trigger.

The reverse direction, setting mddev->ro to match a set readonly
request, can be ignored because the blkdev level readonly flag precludes
the need to have mddev->ro set correctly.  Nevermind the fact that
setting mddev->ro to 1 may fail if the array is in use.

Cc: <stable@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a20a71e..08f6651 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5489,6 +5489,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	int err = 0;
 	void __user *argp = (void __user *)arg;
 	mddev_t *mddev = NULL;
+	int ro;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -5624,6 +5625,34 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			err = do_md_stop(mddev, 1, 1);
 			goto done_unlock;
 
+		case BLKROSET:
+			if (get_user(ro, (int __user *)(arg))) {
+				err = -EFAULT;
+				goto done_unlock;
+			}
+			err = -EINVAL;
+
+			/* if the bdev is going readonly the value of mddev->ro
+			 * does not matter, no writes are coming
+			 */
+			if (ro)
+				goto done_unlock;
+
+			/* are we are already prepared for writes? */
+			if (mddev->ro != 1)
+				goto done_unlock;
+
+			/* transitioning to readauto need only happen for
+			 * arrays that call md_write_start
+			 */
+			if (mddev->pers) {
+				err = restart_array(mddev);
+				if (err == 0) {
+					mddev->ro = 2;
+					set_disk_ro(mddev->gendisk, 0);
+				}
+			}
+			goto done_unlock;
 	}
 
 	/*
-- 
cgit v0.10.2


From ef2f80ff7325b2c1888ff02ead28957b5840bf51 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 17 May 2010 11:27:00 +1000
Subject: md/linear: avoid possible oops and array stop

Since commit ef286f6fa673cd7fb367e1b145069d8dbfcc6081
it has been important that each personality clears
->private in the ->stop() function, or sets it to a
attribute group to be removed.
linear.c doesn't.  This can sometimes lead to an oops,
though it doesn't always.

Suitable for 2.6.33-stable and 2.6.34.

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: stable@kernel.org

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index bb2a231..9db8ee0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -281,6 +281,7 @@ static int linear_stop (mddev_t *mddev)
 	rcu_barrier();
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf);
+	mddev->private = NULL;
 
 	return 0;
 }
-- 
cgit v0.10.2


From b6eb127d274385d81ce8dd45c98190f097bce1b4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 15 Apr 2010 10:13:47 +1000
Subject: md: remove unneeded sysfs files more promptly

When an array is stopped we need to remove some
sysfs files which are dependent on the type of array.

We need to delay that deletion as deleting them while holding
reconfig_mutex can lead to deadlocks.

We currently delay them until the array is completely destroyed.
However it is possible to deactivate and then reactivate the array.
It is also possible to need to remove sysfs files when changing level,
which can potentially happen several times before an array is
destroyed.

So we need to delete these files more promptly: as soon as
reconfig_mutex is dropped.

We need to ensure this happens before do_md_run can restart the array,
so we use open_mutex for some extra locking.  This is not deadlock
prone.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 08f6651..edf777f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -507,9 +507,32 @@ static inline int mddev_trylock(mddev_t * mddev)
 	return mutex_trylock(&mddev->reconfig_mutex);
 }
 
+static struct attribute_group md_redundancy_group;
+
 static inline void mddev_unlock(mddev_t * mddev)
 {
-	mutex_unlock(&mddev->reconfig_mutex);
+	if (mddev->pers == NULL && mddev->private) {
+		/* These cannot be removed under reconfig_mutex as
+		 * an access to the files will try to take reconfig_mutex
+		 * while holding the file unremovable, which leads to
+		 * a deadlock.
+		 * So hold open_mutex instead - we are allowed to take
+		 * it while holding reconfig_mutex, and md_run can
+		 * use it to wait for the remove to complete.
+		 */
+		mutex_lock(&mddev->open_mutex);
+		mutex_unlock(&mddev->reconfig_mutex);
+
+		sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+		if (mddev->private != (void*)1)
+			sysfs_remove_group(&mddev->kobj, mddev->private);
+		if (mddev->sysfs_action)
+			sysfs_put(mddev->sysfs_action);
+		mddev->sysfs_action = NULL;
+		mddev->private = NULL;
+		mutex_unlock(&mddev->open_mutex);
+	} else
+		mutex_unlock(&mddev->reconfig_mutex);
 
 	md_wakeup_thread(mddev->thread);
 }
@@ -4075,15 +4098,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
 {
 	mddev_t *mddev = container_of(ws, mddev_t, del_work);
 
-	if (mddev->private) {
-		sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-		if (mddev->private != (void*)1)
-			sysfs_remove_group(&mddev->kobj, mddev->private);
-		if (mddev->sysfs_action)
-			sysfs_put(mddev->sysfs_action);
-		mddev->sysfs_action = NULL;
-		mddev->private = NULL;
-	}
 	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
 	kobject_del(&mddev->kobj);
 	kobject_put(&mddev->kobj);
@@ -4241,6 +4255,13 @@ static int do_md_run(mddev_t * mddev)
 	if (mddev->pers)
 		return -EBUSY;
 
+	/* These two calls synchronise us with the
+	 * sysfs_remove_group calls in mddev_unlock,
+	 * so they must have completed.
+	 */
+	mutex_lock(&mddev->open_mutex);
+	mutex_unlock(&mddev->open_mutex);
+
 	/*
 	 * Analyze all RAID superblock(s)
 	 */
-- 
cgit v0.10.2


From a64c876fd357906a1f7193723866562ad290654c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 14 Apr 2010 17:15:37 +1000
Subject: md: manage redundancy group in sysfs when changing level.

Some levels expect the 'redundancy group' to be present,
others don't.
So when we change level of an array we might need to
add or remove this group.

This requires fixing up the current practice of overloading ->private
to indicate (when ->pers == NULL) that something needs to be removed.
So create a new ->to_remove to fill that role.

When changing levels, we may need to add or remove attributes.  When
changing RAID5 -> RAID6, we both add and remove the same thing.  It is
important to catch this and optimise it out as the removal is delayed
until a lock is released, so trying to add immediately would cause
problems.


Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index edf777f..e8d2388 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -509,9 +509,9 @@ static inline int mddev_trylock(mddev_t * mddev)
 
 static struct attribute_group md_redundancy_group;
 
-static inline void mddev_unlock(mddev_t * mddev)
+static void mddev_unlock(mddev_t * mddev)
 {
-	if (mddev->pers == NULL && mddev->private) {
+	if (mddev->to_remove) {
 		/* These cannot be removed under reconfig_mutex as
 		 * an access to the files will try to take reconfig_mutex
 		 * while holding the file unremovable, which leads to
@@ -520,16 +520,20 @@ static inline void mddev_unlock(mddev_t * mddev)
 		 * it while holding reconfig_mutex, and md_run can
 		 * use it to wait for the remove to complete.
 		 */
+		struct attribute_group *to_remove = mddev->to_remove;
+		mddev->to_remove = NULL;
 		mutex_lock(&mddev->open_mutex);
 		mutex_unlock(&mddev->reconfig_mutex);
 
-		sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-		if (mddev->private != (void*)1)
-			sysfs_remove_group(&mddev->kobj, mddev->private);
-		if (mddev->sysfs_action)
-			sysfs_put(mddev->sysfs_action);
-		mddev->sysfs_action = NULL;
-		mddev->private = NULL;
+		if (to_remove != &md_redundancy_group)
+			sysfs_remove_group(&mddev->kobj, to_remove);
+		if (mddev->pers == NULL ||
+		    mddev->pers->sync_request == NULL) {
+			sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+			if (mddev->sysfs_action)
+				sysfs_put(mddev->sysfs_action);
+			mddev->sysfs_action = NULL;
+		}
 		mutex_unlock(&mddev->open_mutex);
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
@@ -2996,6 +3000,23 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	/* Looks like we have a winner */
 	mddev_suspend(mddev);
 	mddev->pers->stop(mddev);
+	
+	if (mddev->pers->sync_request == NULL &&
+	    pers->sync_request != NULL) {
+		/* need to add the md_redundancy_group */
+		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+			printk(KERN_WARNING
+			       "md: cannot register extra attributes for %s\n",
+			       mdname(mddev));
+		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+	}		
+	if (mddev->pers->sync_request != NULL &&
+	    pers->sync_request == NULL) {
+		/* need to remove the md_redundancy_group */
+		if (mddev->to_remove == NULL)
+			mddev->to_remove = &md_redundancy_group;
+	}
+
 	module_put(mddev->pers->owner);
 	/* Invalidate devices that are now superfluous */
 	list_for_each_entry(rdev, &mddev->disks, same_set)
@@ -4550,8 +4571,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			mddev->queue->unplug_fn = NULL;
 			mddev->queue->backing_dev_info.congested_fn = NULL;
 			module_put(mddev->pers->owner);
-			if (mddev->pers->sync_request && mddev->private == NULL)
-				mddev->private = (void*)1;
+			if (mddev->pers->sync_request && mddev->to_remove == NULL)
+				mddev->to_remove = &md_redundancy_group;
 			mddev->pers = NULL;
 			/* tell userspace to handle 'inactive' */
 			sysfs_notify_dirent(mddev->sysfs_state);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8e4c75c..722f5df 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -305,6 +305,7 @@ struct mddev_s
 	atomic_t 			max_corr_read_errors; /* max read retries */
 	struct list_head		all_mddevs;
 
+	struct attribute_group		*to_remove;
 	/* Generic barrier handling.
 	 * If there is a pending barrier request, all other
 	 * writes are blocked while the devices are flushed.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 70ffbd0..a361398 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5090,7 +5090,9 @@ static int run(mddev_t *mddev)
 	}
 
 	/* Ok, everything is just fine now */
-	if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
+	if (mddev->to_remove == &raid5_attrs_group)
+		mddev->to_remove = NULL;
+	else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
 		printk(KERN_WARNING
 		       "raid5: failed to create sysfs attributes for %s\n",
 		       mdname(mddev));
@@ -5137,7 +5139,8 @@ static int stop(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_fn = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	free_conf(conf);
-	mddev->private = &raid5_attrs_group;
+	mddev->private = NULL;
+	mddev->to_remove = &raid5_attrs_group;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 964147d5c86d63be79b442c30f3783d49860c078 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 18 May 2010 15:27:13 +1000
Subject: md/raid1: fix counting of write targets.

There is a very small race window when writing to a
RAID1 such that if a device is marked faulty at exactly the wrong
time, the write-in-progress will not be sent to the device,
but the bitmap (if present) will be updated to say that
the write was sent.

Then if the device turned out to still be usable as was re-added
to the array, the bitmap-based-resync would skip resyncing that
block, possibly leading to corruption.  This would only be a problem
if no further writes were issued to that area of the device (i.e.
that bitmap chunk).

Suitable for any pending -stable kernel.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f741f77..1ab30f6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -911,9 +911,10 @@ static int make_request(struct request_queue *q, struct bio * bio)
 			if (test_bit(Faulty, &rdev->flags)) {
 				rdev_dec_pending(rdev, mddev);
 				r1_bio->bios[i] = NULL;
-			} else
+			} else {
 				r1_bio->bios[i] = bio;
-			targets++;
+				targets++;
+			}
 		} else
 			r1_bio->bios[i] = NULL;
 	}
-- 
cgit v0.10.2


From ee8b81b03dffa1c0075553d01c557714aedb85a1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 8 Mar 2010 16:02:36 +1100
Subject: md: remove some dead fields from mddev_s

These fields have never been used.
commit 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7
added them, but also added identical files to bitmap_super_s,
and only used the latter.

So remove these unused fields.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 722f5df..0514578 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -279,9 +279,6 @@ struct mddev_s
 	atomic_t			writes_pending; 
 	struct request_queue		*queue;	/* for plugging ... */
 
-	atomic_t                        write_behind; /* outstanding async IO */
-	unsigned int                    max_write_behind; /* 0 = sync */
-
 	struct bitmap                   *bitmap; /* the bitmap for the device */
 	struct {
 		struct file		*file; /* the bitmap file */
-- 
cgit v0.10.2


From 696fcd535b5a8cfc0617e9cf1d9d69a13895cc1e Mon Sep 17 00:00:00 2001
From: Paul Clements <paul.clements@steeleye.com>
Date: Mon, 8 Mar 2010 16:02:37 +1100
Subject: md: expose max value of behind writes counter

Keep track of the maximum number of concurrent write-behind requests
for an md array and exposed this number in sysfs at
   md/bitmap/max_backlog_used

Writing any value to this file will clear it.

This allows userspace to be involved in tuning bitmap/backlog.

Signed-off-by: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 26ac8aa..6279393 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1292,9 +1292,14 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 	if (!bitmap) return 0;
 
 	if (behind) {
+		int bw;
 		atomic_inc(&bitmap->behind_writes);
+		bw = atomic_read(&bitmap->behind_writes);
+		if (bw > bitmap->behind_writes_used)
+			bitmap->behind_writes_used = bw;
+
 		PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
-		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+		       bw, bitmap->max_write_behind);
 	}
 
 	while (sectors) {
@@ -2006,6 +2011,27 @@ static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry bitmap_can_clear =
 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
 
+static ssize_t
+behind_writes_used_show(mddev_t *mddev, char *page)
+{
+	if (mddev->bitmap == NULL)
+		return sprintf(page, "0\n");
+	return sprintf(page, "%lu\n",
+		       mddev->bitmap->behind_writes_used);
+}
+
+static ssize_t
+behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len)
+{
+	if (mddev->bitmap)
+		mddev->bitmap->behind_writes_used = 0;
+	return len;
+}
+
+static struct md_sysfs_entry max_backlog_used =
+__ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
+       behind_writes_used_show, behind_writes_used_reset);
+
 static struct attribute *md_bitmap_attrs[] = {
 	&bitmap_location.attr,
 	&bitmap_timeout.attr,
@@ -2013,6 +2039,7 @@ static struct attribute *md_bitmap_attrs[] = {
 	&bitmap_chunksize.attr,
 	&bitmap_metadata.attr,
 	&bitmap_can_clear.attr,
+	&max_backlog_used.attr,
 	NULL
 };
 struct attribute_group md_bitmap_group = {
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index cb821d7..aa82b7c 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -227,6 +227,7 @@ struct bitmap {
 	int allclean;
 
 	atomic_t behind_writes;
+	unsigned long behind_writes_used; /* highest actual value at runtime */
 
 	/*
 	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
-- 
cgit v0.10.2


From 7b92813c3c0b6990f14838e3985fb385d2655d0c Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Mon, 8 Mar 2010 16:02:40 +1100
Subject: drivers/md: Remove unnecessary casts of void *

void pointers do not need to be cast to other pointer types.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 6279393..49d6080 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -505,7 +505,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 		return;
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
 	if (bitmap->mddev->events < bitmap->events_cleared) {
 		/* rocking back to read-only */
@@ -526,7 +526,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
 
 	if (!bitmap || !bitmap->sb_page)
 		return;
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
 	printk(KERN_DEBUG "         magic: %08x\n", le32_to_cpu(sb->magic));
 	printk(KERN_DEBUG "       version: %d\n", le32_to_cpu(sb->version));
@@ -575,7 +575,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		return err;
 	}
 
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -661,7 +661,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		return 0;
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	old = le32_to_cpu(sb->state) & bits;
 	switch (op) {
 		case MASK_SET: sb->state |= cpu_to_le32(bits);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 713acd0..608a8d3 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -171,7 +171,7 @@ static void add_sector(conf_t *conf, sector_t start, int mode)
 static int make_request(struct request_queue *q, struct bio *bio)
 {
 	mddev_t *mddev = q->queuedata;
-	conf_t *conf = (conf_t*)mddev->private;
+	conf_t *conf = mddev->private;
 	int failit = 0;
 
 	if (bio_data_dir(bio) == WRITE) {
@@ -224,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	conf_t *conf = (conf_t*)mddev->private;
+	conf_t *conf = mddev->private;
 	int n;
 
 	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
@@ -327,7 +327,7 @@ static int run(mddev_t *mddev)
 
 static int stop(mddev_t *mddev)
 {
-	conf_t *conf = (conf_t *)mddev->private;
+	conf_t *conf = mddev->private;
 
 	kfree(conf);
 	mddev->private = NULL;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 5558ebc..97befd5 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -84,7 +84,7 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
 static void multipath_end_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
+	struct multipath_bh *mp_bh = bio->bi_private;
 	multipath_conf_t *conf = mp_bh->mddev->private;
 	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1ab30f6..23a7516 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -262,7 +262,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio)
 static void raid1_end_read_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+	r1bio_t *r1_bio = bio->bi_private;
 	int mirror;
 	conf_t *conf = r1_bio->mddev->private;
 
@@ -307,7 +307,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
 static void raid1_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+	r1bio_t *r1_bio = bio->bi_private;
 	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 	conf_t *conf = r1_bio->mddev->private;
 	struct bio *to_put = NULL;
@@ -1223,7 +1223,7 @@ abort:
 
 static void end_sync_read(struct bio *bio, int error)
 {
-	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+	r1bio_t *r1_bio = bio->bi_private;
 	int i;
 
 	for (i=r1_bio->mddev->raid_disks; i--; )
@@ -1246,7 +1246,7 @@ static void end_sync_read(struct bio *bio, int error)
 static void end_sync_write(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+	r1bio_t *r1_bio = bio->bi_private;
 	mddev_t *mddev = r1_bio->mddev;
 	conf_t *conf = mddev->private;
 	int i;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b4ba41e..b90fef6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -254,7 +254,7 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
 static void raid10_end_read_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
+	r10bio_t *r10_bio = bio->bi_private;
 	int slot, dev;
 	conf_t *conf = r10_bio->mddev->private;
 
@@ -295,7 +295,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 static void raid10_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
+	r10bio_t *r10_bio = bio->bi_private;
 	int slot, dev;
 	conf_t *conf = r10_bio->mddev->private;
 
@@ -1223,7 +1223,7 @@ abort:
 
 static void end_sync_read(struct bio *bio, int error)
 {
-	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
+	r10bio_t *r10_bio = bio->bi_private;
 	conf_t *conf = r10_bio->mddev->private;
 	int i,d;
 
@@ -1260,7 +1260,7 @@ static void end_sync_read(struct bio *bio, int error)
 static void end_sync_write(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
+	r10bio_t *r10_bio = bio->bi_private;
 	mddev_t *mddev = r10_bio->mddev;
 	conf_t *conf = mddev->private;
 	int i,d;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a361398..10af371 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1618,7 +1618,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	raid5_conf_t *conf = mddev->private;
 	pr_debug("raid5: error called\n");
 
 	if (!test_bit(Faulty, &rdev->flags)) {
@@ -4057,7 +4057,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * As the reads complete, handle_stripe will copy the data
 	 * into the destination stripe and release that stripe.
 	 */
-	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	raid5_conf_t *conf = mddev->private;
 	struct stripe_head *sh;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
@@ -4266,7 +4266,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 /* FIXME go_faster isn't used */
 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	raid5_conf_t *conf = mddev->private;
 	struct stripe_head *sh;
 	sector_t max_sector = mddev->dev_sectors;
 	int sync_blocks;
@@ -5132,7 +5132,7 @@ abort:
 
 static int stop(mddev_t *mddev)
 {
-	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	raid5_conf_t *conf = mddev->private;
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
@@ -5181,7 +5181,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	raid5_conf_t *conf = mddev->private;
 	int i;
 
 	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
-- 
cgit v0.10.2


From c0cc75f84e0e413bce2dcabea74ef418da45c7c1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 22 Mar 2010 10:28:51 +1100
Subject: md: discard StateChanged device flag.

This was needed when sysfs files could only be 'notified'
from process context.  Now that we have sys_notify_direct,
we can call it directly from an interrupt.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e8d2388..2a64cba 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5970,7 +5970,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	mddev->pers->error_handler(mddev,rdev);
 	if (mddev->degraded)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-	set_bit(StateChanged, &rdev->flags);
+	sysfs_notify_dirent(rdev->sysfs_state);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -6962,11 +6962,6 @@ void md_check_recovery(mddev_t *mddev)
 		if (mddev->flags)
 			md_update_sb(mddev, 0);
 
-		list_for_each_entry(rdev, &mddev->disks, same_set)
-			if (test_and_clear_bit(StateChanged, &rdev->flags))
-				sysfs_notify_dirent(rdev->sysfs_state);
-
-
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
 			/* resync/recovery still happening */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 0514578..e4836c68 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -74,9 +74,6 @@ struct mdk_rdev_s
 #define Blocked		8		/* An error occured on an externally
 					 * managed array, don't allow writes
 					 * until it is cleared */
-#define StateChanged	9		/* Faulty or Blocked has changed during
-					 * interrupt, so it needs to be
-					 * notified by the thread */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
-- 
cgit v0.10.2


From 84707f38e767ac470fd82af6c45a8cafe2bd1b9a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Mar 2010 17:23:35 +1100
Subject: md: don't use mddev->raid_disks in raid0 or raid10 while array is
 active.

In a subsequent patch we will make it possible to change
mddev->raid_disks while a RAID0 or RAID10 array is active.  This is
part of the process of reshaping such an array.

This means that we cannot use this value while processes requests
(it is OK to use it during initialisation as we are locked against
changes then).
Both RAID0 and RAID10 have the same value stored in the private data
structure, so use that value instead.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 377cf2a..c2e0d1d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -28,9 +28,10 @@ static void raid0_unplug(struct request_queue *q)
 	mddev_t *mddev = q->queuedata;
 	raid0_conf_t *conf = mddev->private;
 	mdk_rdev_t **devlist = conf->devlist;
+	int raid_disks = conf->strip_zone[0].nb_dev;
 	int i;
 
-	for (i=0; i<mddev->raid_disks; i++) {
+	for (i=0; i < raid_disks; i++) {
 		struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
 
 		blk_unplug(r_queue);
@@ -42,12 +43,13 @@ static int raid0_congested(void *data, int bits)
 	mddev_t *mddev = data;
 	raid0_conf_t *conf = mddev->private;
 	mdk_rdev_t **devlist = conf->devlist;
+	int raid_disks = conf->strip_zone[0].nb_dev;
 	int i, ret = 0;
 
 	if (mddev_congested(mddev, bits))
 		return 1;
 
-	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+	for (i = 0; i < raid_disks && !ret ; i++) {
 		struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
 
 		ret |= bdi_congested(&q->backing_dev_info, bits);
@@ -65,6 +67,7 @@ static void dump_zones(mddev_t *mddev)
 	sector_t zone_start = 0;
 	char b[BDEVNAME_SIZE];
 	raid0_conf_t *conf = mddev->private;
+	int raid_disks = conf->strip_zone[0].nb_dev;
 	printk(KERN_INFO "******* %s configuration *********\n",
 		mdname(mddev));
 	h = 0;
@@ -72,7 +75,7 @@ static void dump_zones(mddev_t *mddev)
 		printk(KERN_INFO "zone%d=[", j);
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
 			printk("%s/",
-			bdevname(conf->devlist[j*mddev->raid_disks
+			bdevname(conf->devlist[j*raid_disks
 						+ k]->bdev, b));
 		printk("]\n");
 
@@ -401,6 +404,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
 	unsigned int sect_in_chunk;
 	sector_t chunk;
 	raid0_conf_t *conf = mddev->private;
+	int raid_disks = conf->strip_zone[0].nb_dev;
 	unsigned int chunk_sects = mddev->chunk_sectors;
 
 	if (is_power_of_2(chunk_sects)) {
@@ -423,7 +427,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
 	*	+ the position in the chunk
 	*/
 	*sector_offset = (chunk * chunk_sects) + sect_in_chunk;
-	return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
+	return conf->devlist[(zone - conf->strip_zone)*raid_disks
 			     + sector_div(sector, zone->nb_dev)];
 }
 
@@ -518,6 +522,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 	int j, k, h;
 	char b[BDEVNAME_SIZE];
 	raid0_conf_t *conf = mddev->private;
+	int raid_disks = conf->strip_zone[0].nb_dev;
 
 	sector_t zone_size;
 	sector_t zone_start = 0;
@@ -528,7 +533,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 		seq_printf(seq, "=[");
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
 			seq_printf(seq, "%s/", bdevname(
-				conf->devlist[j*mddev->raid_disks + k]
+				conf->devlist[j*raid_disks + k]
 						->bdev, b));
 
 		zone_size  = conf->strip_zone[j].zone_end - zone_start;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b90fef6..044c115 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -600,7 +600,7 @@ static void unplug_slaves(mddev_t *mddev)
 	int i;
 
 	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
+	for (i=0; i < conf->raid_disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
 			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -634,7 +634,7 @@ static int raid10_congested(void *data, int bits)
 	if (mddev_congested(mddev, bits))
 		return 1;
 	rcu_read_lock();
-	for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
+	for (i = 0; i < conf->raid_disks && ret == 0; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1131,7 +1131,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int mirror;
 	mirror_info_t *p;
 	int first = 0;
-	int last = mddev->raid_disks - 1;
+	int last = conf->raid_disks - 1;
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -2139,7 +2139,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	conf_t *conf = mddev->private;
 
 	if (!raid_disks)
-		raid_disks = mddev->raid_disks;
+		raid_disks = conf->raid_disks;
 	if (!sectors)
 		sectors = mddev->dev_sectors;
 
@@ -2250,7 +2250,7 @@ static int run(mddev_t *mddev)
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		disk_idx = rdev->raid_disk;
-		if (disk_idx >= mddev->raid_disks
+		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
 			continue;
 		disk = conf->mirrors + disk_idx;
@@ -2311,8 +2311,8 @@ static int run(mddev_t *mddev)
 		       mdname(mddev));
 	printk(KERN_INFO
 		"raid10: raid set %s active with %d out of %d devices\n",
-		mdname(mddev), mddev->raid_disks - mddev->degraded,
-		mddev->raid_disks);
+		mdname(mddev), conf->raid_disks - mddev->degraded,
+		conf->raid_disks);
 	/*
 	 * Ok, everything is just fine now
 	 */
@@ -2335,7 +2335,7 @@ static int run(mddev_t *mddev)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
 
-	if (conf->near_copies < mddev->raid_disks)
+	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 	md_integrity_register(mddev);
 	return 0;
-- 
cgit v0.10.2


From 54071b3808ee3dc8624d9d6f1b06c4fd5308fa3b Mon Sep 17 00:00:00 2001
From: Trela Maciej <Maciej.Trela@intel.com>
Date: Mon, 8 Mar 2010 16:02:42 +1100
Subject: md:Add support for Raid0->Raid5 takeover

Signed-off-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2a64cba..22c630b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3017,6 +3017,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 			mddev->to_remove = &md_redundancy_group;
 	}
 
+	if (mddev->pers->sync_request == NULL &&
+	    mddev->external) {
+		/* We are converting from a no-redundancy array
+		 * to a redundancy array and metadata is managed
+		 * externally so we need to be sure that writes
+		 * won't block due to a need to transition
+		 *      clean->dirty
+		 * until external management is started.
+		 */
+		mddev->in_sync = 0;
+		mddev->safemode_delay = 0;
+		mddev->safemode = 0;
+	}
+
 	module_put(mddev->pers->owner);
 	/* Invalidate devices that are now superfluous */
 	list_for_each_entry(rdev, &mddev->disks, same_set)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 10af371..bb28fd6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
 #include <linux/cpu.h>
 #include "md.h"
 #include "raid5.h"
+#include "raid0.h"
 #include "bitmap.h"
 
 /*
@@ -5619,6 +5620,21 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 }
 
 
+static void *raid5_takeover_raid0(mddev_t *mddev)
+{
+
+	mddev->new_level = 5;
+	mddev->new_layout = ALGORITHM_PARITY_N;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+	mddev->raid_disks += 1;
+	mddev->delta_disks = 1;
+	/* make sure it will be not marked as dirty */
+	mddev->recovery_cp = MaxSector;
+
+	return setup_conf(mddev);
+}
+
+
 static void *raid5_takeover_raid1(mddev_t *mddev)
 {
 	int chunksect;
@@ -5748,6 +5764,16 @@ static void *raid5_takeover(mddev_t *mddev)
 	 *  raid4 - trivial - just use a raid4 layout.
 	 *  raid6 - Providing it is a *_6 layout
 	 */
+	if (mddev->level == 0) {
+		/* for raid0 takeover only one zone is supported */
+		struct raid0_private_data *raid0_priv
+			= mddev->private;
+		if (raid0_priv->nr_strip_zones > 1) {
+			printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n");
+			return ERR_PTR(-EINVAL);
+		}
+		return raid5_takeover_raid0(mddev);
+	}
 
 	if (mddev->level == 1)
 		return raid5_takeover_raid1(mddev);
-- 
cgit v0.10.2


From 9af204cf720cedf369cf823bbd806c350201f7ea Mon Sep 17 00:00:00 2001
From: "Trela, Maciej" <Maciej.Trela@intel.com>
Date: Mon, 8 Mar 2010 16:02:44 +1100
Subject: md: Add support for Raid5->Raid0 and Raid10->Raid0 takeover

Signed-off-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 22c630b..7dcc740 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3045,6 +3045,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
+	if (mddev->pers->sync_request == NULL) {
+		/* this is now an array without redundancy, so
+		 * it must always be in_sync
+		 */
+		mddev->in_sync = 1;
+		del_timer_sync(&mddev->safemode_timer);
+	}
 	pers->run(mddev);
 	mddev_resume(mddev);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c2e0d1d..afddf62 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include "md.h"
 #include "raid0.h"
+#include "raid5.h"
 
 static void raid0_unplug(struct request_queue *q)
 {
@@ -90,7 +91,7 @@ static void dump_zones(mddev_t *mddev)
 	printk(KERN_INFO "**********************************\n\n");
 }
 
-static int create_strip_zones(mddev_t *mddev)
+static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 {
 	int i, c, err;
 	sector_t curr_zone_end, sectors;
@@ -164,6 +165,10 @@ static int create_strip_zones(mddev_t *mddev)
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		int j = rdev1->raid_disk;
 
+		if (mddev->level == 10)
+			/* taking over a raid10-n2 array */
+			j /= 2;
+
 		if (j < 0 || j >= mddev->raid_disks) {
 			printk(KERN_ERR "raid0: bad disk number %d - "
 				"aborting!\n", j);
@@ -264,13 +269,14 @@ static int create_strip_zones(mddev_t *mddev)
 			 (mddev->chunk_sectors << 9) * mddev->raid_disks);
 
 	printk(KERN_INFO "raid0: done.\n");
-	mddev->private = conf;
+	*private_conf = conf;
+
 	return 0;
 abort:
 	kfree(conf->strip_zone);
 	kfree(conf->devlist);
 	kfree(conf);
-	mddev->private = NULL;
+	*private_conf = NULL;
 	return err;
 }
 
@@ -321,6 +327,7 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 
 static int raid0_run(mddev_t *mddev)
 {
+	raid0_conf_t *conf;
 	int ret;
 
 	if (mddev->chunk_sectors == 0) {
@@ -332,9 +339,20 @@ static int raid0_run(mddev_t *mddev)
 	blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
-	ret = create_strip_zones(mddev);
-	if (ret < 0)
-		return ret;
+	/* if private is not null, we are here after takeover */
+	if (mddev->private == NULL) {
+		ret = create_strip_zones(mddev, &conf);
+		if (ret < 0)
+			return ret;
+		mddev->private = conf;
+	}
+	conf = mddev->private;
+	if (conf->scale_raid_disks) {
+		int i;
+		for (i=0; i < conf->strip_zone[0].nb_dev; i++)
+			conf->devlist[i]->raid_disk /= conf->scale_raid_disks;
+		/* FIXME update sysfs rd links */
+	}
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
@@ -548,6 +566,99 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
+static void *raid0_takeover_raid5(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+	raid0_conf_t *priv_conf;
+
+	if (mddev->degraded != 1) {
+		printk(KERN_ERR "md: raid5 must be degraded! Degraded disks: %d\n",
+		       mddev->degraded);
+		return ERR_PTR(-EINVAL);
+	}
+
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		/* check slot number for a disk */
+		if (rdev->raid_disk == mddev->raid_disks-1) {
+			printk(KERN_ERR "md: raid5 must have missing parity disk!\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	/* Set new parameters */
+	mddev->new_level = 0;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+	mddev->raid_disks--;
+	mddev->delta_disks = -1;
+	/* make sure it will be not marked as dirty */
+	mddev->recovery_cp = MaxSector;
+
+	create_strip_zones(mddev, &priv_conf);
+	return priv_conf;
+}
+
+static void *raid0_takeover_raid10(mddev_t *mddev)
+{
+	raid0_conf_t *priv_conf;
+
+	/* Check layout:
+	 *  - far_copies must be 1
+	 *  - near_copies must be 2
+	 *  - disks number must be even
+	 *  - all mirrors must be already degraded
+	 */
+	if (mddev->layout != ((1 << 8) + 2)) {
+		printk(KERN_ERR "md: Raid0 cannot takover layout: %x\n",
+		       mddev->layout);
+		return ERR_PTR(-EINVAL);
+	}
+	if (mddev->raid_disks & 1) {
+		printk(KERN_ERR "md: Raid0 cannot takover Raid10 with odd disk number.\n");
+		return ERR_PTR(-EINVAL);
+	}
+	if (mddev->degraded != (mddev->raid_disks>>1)) {
+		printk(KERN_ERR "md: All mirrors must be already degraded!\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Set new parameters */
+	mddev->new_level = 0;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+	mddev->delta_disks = - mddev->raid_disks / 2;
+	mddev->raid_disks += mddev->delta_disks;
+	mddev->degraded = 0;
+	/* make sure it will be not marked as dirty */
+	mddev->recovery_cp = MaxSector;
+
+	create_strip_zones(mddev, &priv_conf);
+	priv_conf->scale_raid_disks = 2;
+	return priv_conf;
+}
+
+static void *raid0_takeover(mddev_t *mddev)
+{
+	/* raid0 can take over:
+	 *  raid5 - providing it is Raid4 layout and one disk is faulty
+	 *  raid10 - assuming we have all necessary active disks
+	 */
+	if (mddev->level == 5) {
+		if (mddev->layout == ALGORITHM_PARITY_N)
+			return raid0_takeover_raid5(mddev);
+
+		printk(KERN_ERR "md: Raid can only takeover Raid5 with layout: %d\n",
+		       ALGORITHM_PARITY_N);
+	}
+
+	if (mddev->level == 10)
+		return raid0_takeover_raid10(mddev);
+
+	return ERR_PTR(-EINVAL);
+}
+
+static void raid0_quiesce(mddev_t *mddev, int state)
+{
+}
+
 static struct mdk_personality raid0_personality=
 {
 	.name		= "raid0",
@@ -558,6 +669,8 @@ static struct mdk_personality raid0_personality=
 	.stop		= raid0_stop,
 	.status		= raid0_status,
 	.size		= raid0_size,
+	.takeover	= raid0_takeover,
+	.quiesce	= raid0_quiesce,
 };
 
 static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 91f8e87..d724e66 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -13,6 +13,9 @@ struct raid0_private_data
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
+	int scale_raid_disks; /* divide rdev->raid_disks by this in run()
+			       * to handle conversion from raid10
+			       */
 };
 
 typedef struct raid0_private_data raid0_conf_t;
-- 
cgit v0.10.2


From dab8b29248b3f14f456651a2a6ee9b8fd16d1b3c Mon Sep 17 00:00:00 2001
From: "Trela, Maciej" <Maciej.Trela@intel.com>
Date: Mon, 8 Mar 2010 16:02:45 +1100
Subject: md: Add support for Raid0->Raid10 takeover

Signed-off-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 044c115..57d71d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -23,6 +23,7 @@
 #include <linux/seq_file.h>
 #include "md.h"
 #include "raid10.h"
+#include "raid0.h"
 #include "bitmap.h"
 
 /*
@@ -2141,7 +2142,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	if (!raid_disks)
 		raid_disks = conf->raid_disks;
 	if (!sectors)
-		sectors = mddev->dev_sectors;
+		sectors = conf->dev_sectors;
 
 	size = sectors >> conf->chunk_shift;
 	sector_div(size, conf->far_copies);
@@ -2151,62 +2152,60 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	return size << conf->chunk_shift;
 }
 
-static int run(mddev_t *mddev)
+
+static conf_t *setup_conf(mddev_t *mddev)
 {
-	conf_t *conf;
-	int i, disk_idx, chunk_size;
-	mirror_info_t *disk;
-	mdk_rdev_t *rdev;
+	conf_t *conf = NULL;
 	int nc, fc, fo;
 	sector_t stride, size;
+	int err = -EINVAL;
 
 	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(mddev->chunk_sectors)) {
 		printk(KERN_ERR "md/raid10: chunk size must be "
 		       "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE);
-		return -EINVAL;
+		goto out;
 	}
 
 	nc = mddev->layout & 255;
 	fc = (mddev->layout >> 8) & 255;
 	fo = mddev->layout & (1<<16);
+
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
 	    (mddev->layout >> 17)) {
 		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
 		       mdname(mddev), mddev->layout);
 		goto out;
 	}
-	/*
-	 * copy the already verified devices into our private RAID10
-	 * bookkeeping area. [whatever we allocate in run(),
-	 * should be freed in stop()]
-	 */
+
+	err = -ENOMEM;
 	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
-	mddev->private = conf;
-	if (!conf) {
-		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
-			mdname(mddev));
+	if (!conf)
 		goto out;
-	}
+
 	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
-				 GFP_KERNEL);
-	if (!conf->mirrors) {
-		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
-		       mdname(mddev));
-		goto out_free_conf;
-	}
+				GFP_KERNEL);
+	if (!conf->mirrors)
+		goto out;
 
 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage)
-		goto out_free_conf;
+		goto out;
+
 
 	conf->raid_disks = mddev->raid_disks;
 	conf->near_copies = nc;
 	conf->far_copies = fc;
 	conf->copies = nc*fc;
 	conf->far_offset = fo;
-	conf->chunk_mask = mddev->chunk_sectors - 1;
-	conf->chunk_shift = ffz(~mddev->chunk_sectors);
+	conf->chunk_mask = mddev->new_chunk_sectors - 1;
+	conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
+
+	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
+					   r10bio_pool_free, conf);
+	if (!conf->r10bio_pool)
+		goto out;
+
 	size = mddev->dev_sectors >> conf->chunk_shift;
 	sector_div(size, fc);
 	size = size * conf->raid_disks;
@@ -2220,7 +2219,8 @@ static int run(mddev_t *mddev)
 	 */
 	stride += conf->raid_disks - 1;
 	sector_div(stride, conf->raid_disks);
-	mddev->dev_sectors = stride << conf->chunk_shift;
+
+	conf->dev_sectors = stride << conf->chunk_shift;
 
 	if (fo)
 		stride = 1;
@@ -2228,18 +2228,63 @@ static int run(mddev_t *mddev)
 		sector_div(stride, fc);
 	conf->stride = stride << conf->chunk_shift;
 
-	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
-						r10bio_pool_free, conf);
-	if (!conf->r10bio_pool) {
-		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
-			mdname(mddev));
-		goto out_free_conf;
-	}
 
-	conf->mddev = mddev;
 	spin_lock_init(&conf->device_lock);
+	INIT_LIST_HEAD(&conf->retry_list);
+
+	spin_lock_init(&conf->resync_lock);
+	init_waitqueue_head(&conf->wait_barrier);
+
+	conf->thread = md_register_thread(raid10d, mddev, NULL);
+	if (!conf->thread)
+		goto out;
+
+	conf->scale_disks = 0;
+	conf->mddev = mddev;
+	return conf;
+
+ out:
+	printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
+	       mdname(mddev));
+	if (conf) {
+		if (conf->r10bio_pool)
+			mempool_destroy(conf->r10bio_pool);
+		kfree(conf->mirrors);
+		safe_put_page(conf->tmppage);
+		kfree(conf);
+	}
+	return ERR_PTR(err);
+}
+
+static int run(mddev_t *mddev)
+{
+	conf_t *conf;
+	int i, disk_idx, chunk_size;
+	mirror_info_t *disk;
+	mdk_rdev_t *rdev;
+	sector_t size;
+
+	/*
+	 * copy the already verified devices into our private RAID10
+	 * bookkeeping area. [whatever we allocate in run(),
+	 * should be freed in stop()]
+	 */
+
+	if (mddev->private == NULL) {
+		conf = setup_conf(mddev);
+		if (IS_ERR(conf))
+			return PTR_ERR(conf);
+		mddev->private = conf;
+	}
+	conf = mddev->private;
+	if (!conf)
+		goto out;
+
 	mddev->queue->queue_lock = &conf->device_lock;
 
+	mddev->thread = conf->thread;
+	conf->thread = NULL;
+
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
 	if (conf->raid_disks % conf->near_copies)
@@ -2253,6 +2298,11 @@ static int run(mddev_t *mddev)
 		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
 			continue;
+		if (conf->scale_disks) {
+			disk_idx *= conf->scale_disks;
+			rdev->raid_disk = disk_idx;
+			/* MOVE 'rd%d' link !! */
+		}
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
@@ -2270,11 +2320,6 @@ static int run(mddev_t *mddev)
 
 		disk->head_position = 0;
 	}
-	INIT_LIST_HEAD(&conf->retry_list);
-
-	spin_lock_init(&conf->resync_lock);
-	init_waitqueue_head(&conf->wait_barrier);
-
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf)) {
 		printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
@@ -2296,15 +2341,6 @@ static int run(mddev_t *mddev)
 		}
 	}
 
-
-	mddev->thread = md_register_thread(raid10d, mddev, NULL);
-	if (!mddev->thread) {
-		printk(KERN_ERR
-		       "raid10: couldn't allocate thread for %s\n",
-		       mdname(mddev));
-		goto out_free_conf;
-	}
-
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "raid10: %s is not clean"
 		       " -- starting background reconstruction\n",
@@ -2316,8 +2352,10 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
-	mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
+	mddev->dev_sectors = conf->dev_sectors;
+	size = raid10_size(mddev, 0, 0);
+	md_set_array_sectors(mddev, size);
+	mddev->resync_max_sectors = size;
 
 	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2347,6 +2385,7 @@ out_free_conf:
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
+	md_unregister_thread(mddev->thread);
 out:
 	return -EIO;
 }
@@ -2383,6 +2422,58 @@ static void raid10_quiesce(mddev_t *mddev, int state)
 	}
 }
 
+static void *raid10_takeover_raid0(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+	conf_t *conf;
+
+	if (mddev->degraded > 0) {
+		printk(KERN_ERR "error: degraded raid0!\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Update slot numbers to obtain
+	 * degraded raid10 with missing mirrors
+	 */
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		rdev->raid_disk *= 2;
+	}
+
+	/* Set new parameters */
+	mddev->new_level = 10;
+	/* new layout: far_copies = 1, near_copies = 2 */
+	mddev->new_layout = (1<<8) + 2;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+	mddev->delta_disks = mddev->raid_disks;
+	mddev->degraded = mddev->raid_disks;
+	mddev->raid_disks *= 2;
+	/* make sure it will be not marked as dirty */
+	mddev->recovery_cp = MaxSector;
+
+	conf = setup_conf(mddev);
+	conf->scale_disks = 2;
+	return conf;
+}
+
+static void *raid10_takeover(mddev_t *mddev)
+{
+	struct raid0_private_data *raid0_priv;
+
+	/* raid10 can take over:
+	 *  raid0 - providing it has only two drives
+	 */
+	if (mddev->level == 0) {
+		/* for raid0 takeover only one zone is supported */
+		raid0_priv = mddev->private;
+		if (raid0_priv->nr_strip_zones > 1) {
+			printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n");
+			return ERR_PTR(-EINVAL);
+		}
+		return raid10_takeover_raid0(mddev);
+	}
+	return ERR_PTR(-EINVAL);
+}
+
 static struct mdk_personality raid10_personality =
 {
 	.name		= "raid10",
@@ -2399,6 +2490,7 @@ static struct mdk_personality raid10_personality =
 	.sync_request	= sync_request,
 	.quiesce	= raid10_quiesce,
 	.size		= raid10_size,
+	.takeover	= raid10_takeover,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 59cd1ef..3824a08 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,9 +33,16 @@ struct r10_private_data_s {
 					       * 1 stripe.
 					       */
 
+	sector_t		dev_sectors;  /* temp copy of mddev->dev_sectors */
+
 	int chunk_shift; /* shift from chunks to sectors */
 	sector_t chunk_mask;
 
+	int			scale_disks;  /* When starting array, multiply
+					       * each ->raid_disk by this.
+					       * Need for raid0->raid10 migration
+					       */
+
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;
@@ -57,6 +64,11 @@ struct r10_private_data_s {
 	mempool_t *r10bio_pool;
 	mempool_t *r10buf_pool;
 	struct page		*tmppage;
+
+	/* When taking over an array from a different personality, we store
+	 * the new thread here until we fully activate the array.
+	 */
+	struct mdk_thread_s	*thread;
 };
 
 typedef struct r10_private_data_s conf_t;
-- 
cgit v0.10.2


From b71031076e1169e89bdac1b245ad1488587e4730 Mon Sep 17 00:00:00 2001
From: Maciej Trela <Maciej.Trela@intel.com>
Date: Wed, 14 Apr 2010 16:58:16 +1000
Subject: md: Correctly handle device removal via sysfs

Writing "none" to "../md/dev-xx/slot" removes that device
from being an active part of the array, but it didn't
set ->raid_disk to -1 to record this fact.


Signed-off-by: Maciej Trela <Maciej.Trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7dcc740..766be87 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2385,6 +2385,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			return err;
 		sprintf(nm, "rd%d", rdev->raid_disk);
 		sysfs_remove_link(&rdev->mddev->kobj, nm);
+		rdev->raid_disk = -1;
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
 	} else if (rdev->mddev->pers) {
-- 
cgit v0.10.2


From 233fca36bb439eadcad28500b5139fed7c64a0ae Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 14 Apr 2010 17:02:09 +1000
Subject: md: Relax checks on ->max_disks when external metadata handling is
 used.

When metadata is being managed by user-space, md doesn't know
what the maximum number of devices allowed in an array is
so ->max_disks is 0.  In this case we should allow any (+ve)
number of disks.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 766be87..46bdf4b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2801,8 +2801,9 @@ static void analyze_sbs(mddev_t * mddev)
 
 	i = 0;
 	rdev_for_each(rdev, tmp, mddev) {
-		if (rdev->desc_nr >= mddev->max_disks ||
-		    i > mddev->max_disks) {
+		if (mddev->max_disks &&
+		    (rdev->desc_nr >= mddev->max_disks ||
+		     i > mddev->max_disks)) {
 			printk(KERN_WARNING
 			       "md: %s: %s: only %d devices permitted\n",
 			       mdname(mddev), bdevname(rdev->bdev, b),
@@ -5406,7 +5407,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
 	if (mddev->pers->check_reshape == NULL)
 		return -EINVAL;
 	if (raid_disks <= 0 ||
-	    raid_disks >= mddev->max_disks)
+	    (mddev->max_disks && raid_disks >= mddev->max_disks))
 		return -EINVAL;
 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
 		return -EBUSY;
-- 
cgit v0.10.2


From 5cac7861b2de95a1f714ebdc652813abd0afcc73 Mon Sep 17 00:00:00 2001
From: Maciej Trela <Maciej.Trela@intel.com>
Date: Wed, 14 Apr 2010 17:17:39 +1000
Subject: md: notify level changes through sysfs.

Level changes can be very significant, so make sure
to notify them via sysfs.

Signed-off-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 46bdf4b..c5a1b07 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3059,6 +3059,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	sysfs_notify(&mddev->kobj, NULL, "level");
 	return rv;
 }
 
-- 
cgit v0.10.2


From a78d38a1a16c8e009aa512caa71d483757fefc1c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 22 Mar 2010 16:53:49 +1100
Subject: md: add support for raid5 to raid4 conversion

This is unlikely to be wanted, but we may as well provide it
for completeness.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bb28fd6..020143d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5788,6 +5788,18 @@ static void *raid5_takeover(mddev_t *mddev)
 	return ERR_PTR(-EINVAL);
 }
 
+static void *raid4_takeover(mddev_t *mddev)
+{
+	/* raid4 can take over raid5 if layout is right.
+	 */
+	if (mddev->level == 5 &&
+	    mddev->layout == ALGORITHM_PARITY_N) {
+		mddev->new_layout = 0;
+		mddev->new_level = 4;
+		return setup_conf(mddev);
+	}
+	return ERR_PTR(-EINVAL);
+}
 
 static struct mdk_personality raid5_personality;
 
@@ -5903,6 +5915,7 @@ static struct mdk_personality raid4_personality =
 	.start_reshape  = raid5_start_reshape,
 	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
+	.takeover	= raid4_takeover,
 };
 
 static int __init raid5_init(void)
-- 
cgit v0.10.2


From 2b7f22284d71975e37a82db154386348eec0e52c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Mar 2010 16:06:03 +1100
Subject: md/raid5: small tidyup in raid5_align_endio

Diving through ->queue to find mddev is unnecessarily complex - there
is an easier path to finding mddev, so use that.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 020143d..7bfeba3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3713,10 +3713,10 @@ static void raid5_align_endio(struct bio *bi, int error)
 
 	bio_put(bi);
 
-	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
-	conf = mddev->private;
 	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
+	mddev = rdev->mddev;
+	conf = mddev->private;
 
 	rdev_dec_pending(rdev, conf->mddev);
 
-- 
cgit v0.10.2


From 490773268cf64f68da2470e07b52c7944da6312d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Mar 2010 16:20:56 +1100
Subject: md: move io accounting out of personalities into md_make_request

While I generally prefer letting personalities do as much as possible,
given that we have a central md_make_request anyway we may as well use
it to simplify code.
Also this centralises knowledge of ->gendisk which will help later.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 9db8ee0..3048c17 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,23 +288,15 @@ static int linear_stop (mddev_t *mddev)
 
 static int linear_make_request (struct request_queue *q, struct bio *bio)
 {
-	const int rw = bio_data_dir(bio);
 	mddev_t *mddev = q->queuedata;
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
-	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
-	part_stat_unlock();
-
 	rcu_read_lock();
 	tmp_dev = which_dev(mddev, bio->bi_sector);
 	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c5a1b07..117663d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -214,8 +214,11 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
  */
 static int md_make_request(struct request_queue *q, struct bio *bio)
 {
+	const int rw = bio_data_dir(bio);
 	mddev_t *mddev = q->queuedata;
 	int rv;
+	int cpu;
+
 	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
 		return 0;
@@ -236,7 +239,15 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	}
 	atomic_inc(&mddev->active_io);
 	rcu_read_unlock();
+
 	rv = mddev->pers->make_request(q, bio);
+
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
+
 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
 		wake_up(&mddev->sb_wait);
 
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 97befd5..5b4e291 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -141,8 +141,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	multipath_conf_t *conf = mddev->private;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
-	const int rw = bio_data_dir(bio);
-	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		md_barrier_request(mddev, bio);
@@ -154,12 +152,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
-	part_stat_unlock();
-
 	mp_bh->path = multipath_map(conf);
 	if (mp_bh->path < 0) {
 		bio_endio(bio, -EIO);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index afddf62..d535f9b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -472,20 +472,12 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 	sector_t sector_offset;
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
-	const int rw = bio_data_dir(bio);
-	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
-	part_stat_unlock();
-
 	chunk_sects = mddev->chunk_sectors;
 	if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
 		sector_t sector = bio->bi_sector;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 23a7516..e277013 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -787,7 +787,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
-	int cpu;
 	bool do_barriers;
 	mdk_rdev_t *blocked_rdev;
 
@@ -833,12 +832,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	bitmap = mddev->bitmap;
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
-	part_stat_unlock();
-
 	/*
 	 * make_request() can abort the operation when READA is being
 	 * used and no empty request is available.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 57d71d5..ca313d6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -795,7 +795,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	mirror_info_t *mirror;
 	r10bio_t *r10_bio;
 	struct bio *read_bio;
-	int cpu;
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
@@ -850,12 +849,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	 */
 	wait_barrier(conf);
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
-	part_stat_unlock();
-
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 	r10_bio->master_bio = bio;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7bfeba3..c6ae7c1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3879,7 +3879,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
-	int cpu, remaining;
+	int remaining;
 
 	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
 		/* Drain all pending writes.  We only really need
@@ -3894,12 +3894,6 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
 	md_write_start(mddev, bi);
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bi));
-	part_stat_unlock();
-
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(q,bi))
-- 
cgit v0.10.2


From 49ce6cea85fb8d25ee59486c919406e9cecf1762 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Mar 2010 10:51:42 +1100
Subject: md: don't reference gendisk in getgeo

Using ->array_sectors rather than get_capacity() is more
direct and is a step towards relaxing the tight connection
between mddev and gendisk.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 117663d..69f2a8e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5556,7 +5556,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 
 	geo->heads = 2;
 	geo->sectors = 4;
-	geo->cylinders = get_capacity(mddev->gendisk) / 8;
+	geo->cylinders = mddev->array_sectors / 8;
 	return 0;
 }
 
-- 
cgit v0.10.2


From b821eaa572fd737faaf6928ba046e571526c36c6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Mar 2010 11:18:15 +1100
Subject: md: remove ->changed and related code.

We set ->changed to 1 and call check_disk_change at the end
of md_open so that bd_invalidated would be set and thus
partition rescan would happen appropriately.

Now that we call revalidate_disk directly, which sets bd_invalidates,
that indirection is no longer needed and can be removed.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 69f2a8e..f2b3001 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4501,7 +4501,6 @@ static int do_md_run(mddev_t * mddev)
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
 	revalidate_disk(mddev->gendisk);
-	mddev->changed = 1;
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
 	if (mddev->sysfs_action)
@@ -4620,7 +4619,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 				}
 
 			set_capacity(disk, 0);
-			mddev->changed = 1;
+			revalidate_disk(disk);
 
 			if (mddev->ro)
 				mddev->ro = 0;
@@ -4686,7 +4685,6 @@ out:
 		mddev->sync_speed_min = mddev->sync_speed_max = 0;
 		mddev->recovery = 0;
 		mddev->in_sync = 0;
-		mddev->changed = 0;
 		mddev->degraded = 0;
 		mddev->barriers_work = 0;
 		mddev->safemode = 0;
@@ -5850,7 +5848,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	atomic_inc(&mddev->openers);
 	mutex_unlock(&mddev->open_mutex);
 
-	check_disk_change(bdev);
  out:
 	return err;
 }
@@ -5865,21 +5862,6 @@ static int md_release(struct gendisk *disk, fmode_t mode)
 
 	return 0;
 }
-
-static int md_media_changed(struct gendisk *disk)
-{
-	mddev_t *mddev = disk->private_data;
-
-	return mddev->changed;
-}
-
-static int md_revalidate(struct gendisk *disk)
-{
-	mddev_t *mddev = disk->private_data;
-
-	mddev->changed = 0;
-	return 0;
-}
 static const struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
@@ -5890,8 +5872,6 @@ static const struct block_device_operations md_fops =
 	.compat_ioctl	= md_compat_ioctl,
 #endif
 	.getgeo		= md_getgeo,
-	.media_changed	= md_media_changed,
-	.revalidate_disk= md_revalidate,
 };
 
 static int md_thread(void * arg)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e4836c68..3225e25 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -237,7 +237,6 @@ struct mddev_s
 	atomic_t			active;		/* general refcount */
 	atomic_t			openers;	/* number of active opens */
 
-	int				changed;	/* true if we might need to reread partition info */
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e277013..eebce16 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2184,7 +2184,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
 		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
-	mddev->changed = 1;
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp == MaxSector) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c6ae7c1..231afda 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5335,7 +5335,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	    raid5_size(mddev, sectors, mddev->raid_disks))
 		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
-	mddev->changed = 1;
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->dev_sectors;
@@ -5549,7 +5548,6 @@ static void raid5_finish_reshape(mddev_t *mddev)
 		if (mddev->delta_disks > 0) {
 			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 			set_capacity(mddev->gendisk, mddev->array_sectors);
-			mddev->changed = 1;
 			revalidate_disk(mddev->gendisk);
 		} else {
 			int d;
-- 
cgit v0.10.2


From fe60b0142813002be16dfae28780d9779ee22473 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Mar 2010 11:10:42 +1100
Subject: md: factor do_md_run to separate accesses to ->gendisk

As part of relaxing the binding between an mddev and gendisk,
we separate do_md_run into two functions.
  md_run does all the work internal to md
  do_md_run calls md_run and makes and changes to gendisk
     that are required.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f2b3001..e752332 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4297,11 +4297,10 @@ static void md_safemode_timeout(unsigned long data)
 
 static int start_dirty_degraded;
 
-static int do_md_run(mddev_t * mddev)
+static int md_run(mddev_t *mddev)
 {
 	int err;
 	mdk_rdev_t *rdev;
-	struct gendisk *disk;
 	struct mdk_personality *pers;
 
 	if (list_empty(&mddev->disks))
@@ -4366,8 +4365,6 @@ static int do_md_run(mddev_t * mddev)
 		sysfs_notify_dirent(rdev->sysfs_state);
 	}
 
-	disk = mddev->gendisk;
-
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
@@ -4495,21 +4492,32 @@ static int do_md_run(mddev_t * mddev)
 	if (mddev->flags)
 		md_update_sb(mddev, 0);
 
-	set_capacity(disk, mddev->array_sectors);
-
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
-	revalidate_disk(mddev->gendisk);
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
 	if (mddev->sysfs_action)
 		sysfs_notify_dirent(mddev->sysfs_action);
 	sysfs_notify(&mddev->kobj, NULL, "degraded");
-	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
 	return 0;
 }
 
+static int do_md_run(mddev_t *mddev)
+{
+	int err;
+
+	err = md_run(mddev);
+	if (err)
+		goto out;
+
+	set_capacity(mddev->gendisk, mddev->array_sectors);
+	revalidate_disk(mddev->gendisk);
+	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+out:
+	return err;
+}
+
 static int restart_array(mddev_t *mddev)
 {
 	struct gendisk *disk = mddev->gendisk;
-- 
cgit v0.10.2


From 6177b472ab14e1ac88896960370dd54ba577d926 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Mar 2010 11:37:13 +1100
Subject: md: start to refactor do_md_stop

do_md_stop is large and clunky, so hard to understand.

This is a first step of refactoring, pulling two simple
sub-functions out.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e752332..002d0a3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4568,6 +4568,58 @@ void restore_bitmap_write_access(struct file *file)
 	spin_unlock(&inode->i_lock);
 }
 
+static void md_clean(mddev_t *mddev)
+{
+	mddev->array_sectors = 0;
+	mddev->external_size = 0;
+	mddev->dev_sectors = 0;
+	mddev->raid_disks = 0;
+	mddev->recovery_cp = 0;
+	mddev->resync_min = 0;
+	mddev->resync_max = MaxSector;
+	mddev->reshape_position = MaxSector;
+	mddev->external = 0;
+	mddev->persistent = 0;
+	mddev->level = LEVEL_NONE;
+	mddev->clevel[0] = 0;
+	mddev->flags = 0;
+	mddev->ro = 0;
+	mddev->metadata_type[0] = 0;
+	mddev->chunk_sectors = 0;
+	mddev->ctime = mddev->utime = 0;
+	mddev->layout = 0;
+	mddev->max_disks = 0;
+	mddev->events = 0;
+	mddev->delta_disks = 0;
+	mddev->new_level = LEVEL_NONE;
+	mddev->new_layout = 0;
+	mddev->new_chunk_sectors = 0;
+	mddev->curr_resync = 0;
+	mddev->resync_mismatches = 0;
+	mddev->suspend_lo = mddev->suspend_hi = 0;
+	mddev->sync_speed_min = mddev->sync_speed_max = 0;
+	mddev->recovery = 0;
+	mddev->in_sync = 0;
+	mddev->degraded = 0;
+	mddev->barriers_work = 0;
+	mddev->safemode = 0;
+	mddev->bitmap_info.offset = 0;
+	mddev->bitmap_info.default_offset = 0;
+	mddev->bitmap_info.chunksize = 0;
+	mddev->bitmap_info.daemon_sleep = 0;
+	mddev->bitmap_info.max_write_behind = 0;
+}
+
+static void md_stop(mddev_t *mddev)
+{
+	mddev->pers->stop(mddev);
+	if (mddev->pers->sync_request && mddev->to_remove == NULL)
+		mddev->to_remove = &md_redundancy_group;
+	module_put(mddev->pers->owner);
+	mddev->pers = NULL;
+
+}
+
 /* mode:
  *   0 - completely stop and dis-assemble array
  *   1 - switch to readonly
@@ -4608,14 +4660,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			if (mddev->ro)
 				set_disk_ro(disk, 0);
 
-			mddev->pers->stop(mddev);
+			md_stop(mddev);
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
 			mddev->queue->backing_dev_info.congested_fn = NULL;
-			module_put(mddev->pers->owner);
-			if (mddev->pers->sync_request && mddev->to_remove == NULL)
-				mddev->to_remove = &md_redundancy_group;
-			mddev->pers = NULL;
+
 			/* tell userspace to handle 'inactive' */
 			sysfs_notify_dirent(mddev->sysfs_state);
 
@@ -4663,44 +4712,7 @@ out:
 
 		export_array(mddev);
 
-		mddev->array_sectors = 0;
-		mddev->external_size = 0;
-		mddev->dev_sectors = 0;
-		mddev->raid_disks = 0;
-		mddev->recovery_cp = 0;
-		mddev->resync_min = 0;
-		mddev->resync_max = MaxSector;
-		mddev->reshape_position = MaxSector;
-		mddev->external = 0;
-		mddev->persistent = 0;
-		mddev->level = LEVEL_NONE;
-		mddev->clevel[0] = 0;
-		mddev->flags = 0;
-		mddev->ro = 0;
-		mddev->metadata_type[0] = 0;
-		mddev->chunk_sectors = 0;
-		mddev->ctime = mddev->utime = 0;
-		mddev->layout = 0;
-		mddev->max_disks = 0;
-		mddev->events = 0;
-		mddev->delta_disks = 0;
-		mddev->new_level = LEVEL_NONE;
-		mddev->new_layout = 0;
-		mddev->new_chunk_sectors = 0;
-		mddev->curr_resync = 0;
-		mddev->resync_mismatches = 0;
-		mddev->suspend_lo = mddev->suspend_hi = 0;
-		mddev->sync_speed_min = mddev->sync_speed_max = 0;
-		mddev->recovery = 0;
-		mddev->in_sync = 0;
-		mddev->degraded = 0;
-		mddev->barriers_work = 0;
-		mddev->safemode = 0;
-		mddev->bitmap_info.offset = 0;
-		mddev->bitmap_info.default_offset = 0;
-		mddev->bitmap_info.chunksize = 0;
-		mddev->bitmap_info.daemon_sleep = 0;
-		mddev->bitmap_info.max_write_behind = 0;
+		md_clean(mddev);
 		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
 		if (mddev->hold_active == UNTIL_STOP)
 			mddev->hold_active = 0;
-- 
cgit v0.10.2


From a047e125403112ceb4d41e68307a2e7498ddba4e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Mar 2010 12:07:53 +1100
Subject: md: factor md_stop_writes out of do_md_stop.

Further refactoring of do_md_stop.
This one requires some explanation as it takes code from different
places in do_md_stop, so some re-ordering happens.

We only get into this part of do_md_stop if there are no active opens
of the device, so no writes can be happening and the device must have
been flushed.  In md_stop_writes we want to stop any internal sources
of writes - i.e. resync - and flush out the metadata.

The only code that was previously before some of this code is
code to clean up the queue, the mddev, the gendisk, or sysfs, all
of which is probably better after code that makes active changes (i.e.
triggers writes).

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 002d0a3..86dfbc3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4610,6 +4610,27 @@ static void md_clean(mddev_t *mddev)
 	mddev->bitmap_info.max_write_behind = 0;
 }
 
+static void md_stop_writes(mddev_t *mddev)
+{
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		md_unregister_thread(mddev->sync_thread);
+		mddev->sync_thread = NULL;
+	}
+
+	del_timer_sync(&mddev->safemode_timer);
+
+	bitmap_flush(mddev);
+	md_super_wait(mddev);
+
+	if (!mddev->in_sync || mddev->flags) {
+		/* mark array as shutdown cleanly */
+		mddev->in_sync = 1;
+		md_update_sb(mddev, 1);
+	}
+}
+
 static void md_stop(mddev_t *mddev)
 {
 	mddev->pers->stop(mddev);
@@ -4637,14 +4658,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		err = -EBUSY;
 	} else if (mddev->pers) {
 
-		if (mddev->sync_thread) {
-			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			md_unregister_thread(mddev->sync_thread);
-			mddev->sync_thread = NULL;
-		}
-
-		del_timer_sync(&mddev->safemode_timer);
+		md_stop_writes(mddev);
 
 		switch(mode) {
 		case 1: /* readonly */
@@ -4655,8 +4669,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			break;
 		case 0: /* disassemble */
 		case 2: /* stop */
-			bitmap_flush(mddev);
-			md_super_wait(mddev);
 			if (mddev->ro)
 				set_disk_ro(disk, 0);
 
@@ -4681,11 +4693,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			if (mddev->ro)
 				mddev->ro = 0;
 		}
-		if (!mddev->in_sync || mddev->flags) {
-			/* mark array as shutdown cleanly */
-			mddev->in_sync = 1;
-			md_update_sb(mddev, 1);
-		}
 		if (mode == 1)
 			set_disk_ro(disk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-- 
cgit v0.10.2


From a4bd82d0d03b1485975579f131ccfd0aad6b7d6e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Mar 2010 13:23:10 +1100
Subject: md: split md_set_readonly out of do_md_stop

Using do_md_stop to set an array to read-only is a little confusing.
Now most of the common code has been factored out, split
md_set_readonly off in to a separate function.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 86dfbc3..3a2710a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3309,6 +3309,7 @@ array_state_show(mddev_t *mddev, char *page)
 }
 
 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
+static int md_set_readonly(mddev_t * mddev, int is_open);
 static int do_md_run(mddev_t * mddev);
 static int restart_array(mddev_t *mddev);
 
@@ -3339,7 +3340,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break; /* not supported yet */
 	case readonly:
 		if (mddev->pers)
-			err = do_md_stop(mddev, 1, 0);
+			err = md_set_readonly(mddev, 0);
 		else {
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
@@ -3349,7 +3350,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	case read_auto:
 		if (mddev->pers) {
 			if (mddev->ro == 0)
-				err = do_md_stop(mddev, 1, 0);
+				err = md_set_readonly(mddev, 0);
 			else if (mddev->ro == 1)
 				err = restart_array(mddev);
 			if (err == 0) {
@@ -4641,9 +4642,34 @@ static void md_stop(mddev_t *mddev)
 
 }
 
+static int md_set_readonly(mddev_t *mddev, int is_open)
+{
+	int err = 0;
+	mutex_lock(&mddev->open_mutex);
+	if (atomic_read(&mddev->openers) > is_open) {
+		printk("md: %s still in use.\n",mdname(mddev));
+		err = -EBUSY;
+		goto out;
+	}
+	if (mddev->pers) {
+		md_stop_writes(mddev);
+
+		err  = -ENXIO;
+		if (mddev->ro==1)
+			goto out;
+		mddev->ro = 1;
+		set_disk_ro(mddev->gendisk, 1);
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		sysfs_notify_dirent(mddev->sysfs_state);
+		err = 0;	
+	}
+out:
+	mutex_unlock(&mddev->open_mutex);
+	return err;
+}
+
 /* mode:
  *   0 - completely stop and dis-assemble array
- *   1 - switch to readonly
  *   2 - stop but do not disassemble array
  */
 static int do_md_stop(mddev_t * mddev, int mode, int is_open)
@@ -4660,45 +4686,33 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 
 		md_stop_writes(mddev);
 
-		switch(mode) {
-		case 1: /* readonly */
-			err  = -ENXIO;
-			if (mddev->ro==1)
-				goto out;
-			mddev->ro = 1;
-			break;
-		case 0: /* disassemble */
-		case 2: /* stop */
-			if (mddev->ro)
-				set_disk_ro(disk, 0);
+		if (mddev->ro)
+			set_disk_ro(disk, 0);
 
-			md_stop(mddev);
-			mddev->queue->merge_bvec_fn = NULL;
-			mddev->queue->unplug_fn = NULL;
-			mddev->queue->backing_dev_info.congested_fn = NULL;
+		md_stop(mddev);
+		mddev->queue->merge_bvec_fn = NULL;
+		mddev->queue->unplug_fn = NULL;
+		mddev->queue->backing_dev_info.congested_fn = NULL;
 
-			/* tell userspace to handle 'inactive' */
-			sysfs_notify_dirent(mddev->sysfs_state);
+		/* tell userspace to handle 'inactive' */
+		sysfs_notify_dirent(mddev->sysfs_state);
 
-			list_for_each_entry(rdev, &mddev->disks, same_set)
-				if (rdev->raid_disk >= 0) {
-					char nm[20];
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					sysfs_remove_link(&mddev->kobj, nm);
-				}
+		list_for_each_entry(rdev, &mddev->disks, same_set)
+			if (rdev->raid_disk >= 0) {
+				char nm[20];
+				sprintf(nm, "rd%d", rdev->raid_disk);
+				sysfs_remove_link(&mddev->kobj, nm);
+			}
 
-			set_capacity(disk, 0);
-			revalidate_disk(disk);
+		set_capacity(disk, 0);
+		revalidate_disk(disk);
 
-			if (mddev->ro)
-				mddev->ro = 0;
-		}
-		if (mode == 1)
-			set_disk_ro(disk, 1);
+		if (mddev->ro)
+			mddev->ro = 0;
+		
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		err = 0;
 	}
-out:
 	mutex_unlock(&mddev->open_mutex);
 	if (err)
 		return err;
@@ -4724,9 +4738,7 @@ out:
 		if (mddev->hold_active == UNTIL_STOP)
 			mddev->hold_active = 0;
 
-	} else if (mddev->pers)
-		printk(KERN_INFO "md: %s switched to read-only mode.\n",
-			mdname(mddev));
+	}
 	err = 0;
 	blk_integrity_unregister(disk);
 	md_new_event(mddev);
@@ -5724,7 +5736,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			goto done_unlock;
 
 		case STOP_ARRAY_RO:
-			err = do_md_stop(mddev, 1, 1);
+			err = md_set_readonly(mddev, 1);
 			goto done_unlock;
 
 		case BLKROSET:
@@ -7140,7 +7152,7 @@ static int md_notify_reboot(struct notifier_block *this,
 				 * appears to still be in use.  Hence
 				 * the '100'.
 				 */
-				do_md_stop(mddev, 1, 100);
+				md_set_readonly(mddev, 100);
 				mddev_unlock(mddev);
 			}
 		/*
-- 
cgit v0.10.2


From cca9cf90c504d98644ace52c474770970729f0eb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 1 Apr 2010 12:08:16 +1100
Subject: md: call md_stop_writes from md_stop

This moves the call to the other side of set_readonly, but that should
not be an issue.
This encapsulates in 'md_stop' all of the functionality for internally
stopping the array, leaving all the interactions with externalities
(sysfs, request_queue, gendisk) in do_md_stop.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3a2710a..f48ba41 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4634,12 +4634,14 @@ static void md_stop_writes(mddev_t *mddev)
 
 static void md_stop(mddev_t *mddev)
 {
+	md_stop_writes(mddev);
+
 	mddev->pers->stop(mddev);
 	if (mddev->pers->sync_request && mddev->to_remove == NULL)
 		mddev->to_remove = &md_redundancy_group;
 	module_put(mddev->pers->owner);
 	mddev->pers = NULL;
-
+	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 }
 
 static int md_set_readonly(mddev_t *mddev, int is_open)
@@ -4684,8 +4686,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		err = -EBUSY;
 	} else if (mddev->pers) {
 
-		md_stop_writes(mddev);
-
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
 
@@ -4710,7 +4710,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		if (mddev->ro)
 			mddev->ro = 0;
 		
-		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		err = 0;
 	}
 	mutex_unlock(&mddev->open_mutex);
-- 
cgit v0.10.2


From 21a52c6d05c15f862797736393915bfa8cd40ee9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 1 Apr 2010 15:02:13 +1100
Subject: md: pass mddev to make_request functions rather than request_queue

We used to pass the personality make_request function direct
to the block layer so the first argument had to be a queue.
But now we have the intermediary md_make_request so it makes
at lot more sense to pass a struct mddev_s.
It makes it possible to have an mddev without its own queue too.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 608a8d3..bd4348f 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -168,9 +168,8 @@ static void add_sector(conf_t *conf, sector_t start, int mode)
 		conf->nfaults = n+1;
 }
 
-static int make_request(struct request_queue *q, struct bio *bio)
+static int make_request(mddev_t *mddev, struct bio *bio)
 {
-	mddev_t *mddev = q->queuedata;
 	conf_t *conf = mddev->private;
 	int failit = 0;
 
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3048c17..3204a22 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -286,9 +286,8 @@ static int linear_stop (mddev_t *mddev)
 	return 0;
 }
 
-static int linear_make_request (struct request_queue *q, struct bio *bio)
+static int linear_make_request (mddev_t *mddev, struct bio *bio)
 {
-	mddev_t *mddev = q->queuedata;
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
 
@@ -328,9 +327,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 
 		bp = bio_split(bio, end_sector - bio->bi_sector);
 
-		if (linear_make_request(q, &bp->bio1))
+		if (linear_make_request(mddev, &bp->bio1))
 			generic_make_request(&bp->bio1);
-		if (linear_make_request(q, &bp->bio2))
+		if (linear_make_request(mddev, &bp->bio2))
 			generic_make_request(&bp->bio2);
 		bio_pair_release(bp);
 		return 0;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f48ba41..2e05b0c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -240,7 +240,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	atomic_inc(&mddev->active_io);
 	rcu_read_unlock();
 
-	rv = mddev->pers->make_request(q, bio);
+	rv = mddev->pers->make_request(mddev, bio);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -354,7 +354,7 @@ static void md_submit_barrier(struct work_struct *ws)
 		bio_endio(bio, 0);
 	else {
 		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
-		if (mddev->pers->make_request(mddev->queue, bio))
+		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
 		mddev->barrier = POST_REQUEST_BARRIER;
 		submit_barriers(mddev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 3225e25..a536f54 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -330,7 +330,7 @@ struct mdk_personality
 	int level;
 	struct list_head list;
 	struct module *owner;
-	int (*make_request)(struct request_queue *q, struct bio *bio);
+	int (*make_request)(mddev_t *mddev, struct bio *bio);
 	int (*run)(mddev_t *mddev);
 	int (*stop)(mddev_t *mddev);
 	void (*status)(struct seq_file *seq, mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 5b4e291..50bf8e6 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -135,9 +135,8 @@ static void multipath_unplug(struct request_queue *q)
 }
 
 
-static int multipath_make_request (struct request_queue *q, struct bio * bio)
+static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 {
-	mddev_t *mddev = q->queuedata;
 	multipath_conf_t *conf = mddev->private;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d535f9b..9f9c6b7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -465,9 +465,8 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev,
 	}
 }
 
-static int raid0_make_request(struct request_queue *q, struct bio *bio)
+static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 {
-	mddev_t *mddev = q->queuedata;
 	unsigned int chunk_sects;
 	sector_t sector_offset;
 	struct strip_zone *zone;
@@ -495,9 +494,9 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 		else
 			bp = bio_split(bio, chunk_sects -
 				       sector_div(sector, chunk_sects));
-		if (raid0_make_request(q, &bp->bio1))
+		if (raid0_make_request(mddev, &bp->bio1))
 			generic_make_request(&bp->bio1);
-		if (raid0_make_request(q, &bp->bio2))
+		if (raid0_make_request(mddev, &bp->bio2))
 			generic_make_request(&bp->bio2);
 
 		bio_pair_release(bp);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index eebce16..5ff75c4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -773,9 +773,8 @@ do_sync_io:
 	return NULL;
 }
 
-static int make_request(struct request_queue *q, struct bio * bio)
+static int make_request(mddev_t *mddev, struct bio * bio)
 {
-	mddev_t *mddev = q->queuedata;
 	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ca313d6..a1d72761 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -788,9 +788,8 @@ static void unfreeze_array(conf_t *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static int make_request(struct request_queue *q, struct bio * bio)
+static int make_request(mddev_t *mddev, struct bio * bio)
 {
-	mddev_t *mddev = q->queuedata;
 	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
 	r10bio_t *r10_bio;
@@ -824,9 +823,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		 */
 		bp = bio_split(bio,
 			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
-		if (make_request(q, &bp->bio1))
+		if (make_request(mddev, &bp->bio1))
 			generic_make_request(&bp->bio1);
-		if (make_request(q, &bp->bio2))
+		if (make_request(mddev, &bp->bio2))
 			generic_make_request(&bp->bio2);
 
 		bio_pair_release(bp);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 231afda..2882a26 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3753,9 +3753,8 @@ static int bio_fits_rdev(struct bio *bi)
 }
 
 
-static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
+static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
 {
-	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev->private;
 	int dd_idx;
 	struct bio* align_bi;
@@ -3870,9 +3869,8 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
 	return sh;
 }
 
-static int make_request(struct request_queue *q, struct bio * bi)
+static int make_request(mddev_t *mddev, struct bio * bi)
 {
-	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev->private;
 	int dd_idx;
 	sector_t new_sector;
@@ -3896,7 +3894,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
-	     chunk_aligned_read(q,bi))
+	     chunk_aligned_read(mddev,bi))
 		return 0;
 
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-- 
cgit v0.10.2


From fafd7fb052182e087b5a3c6c408e4ac8c2b5fa14 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 1 Apr 2010 15:55:30 +1100
Subject: md: factor out init code for an mddev

This is a simple factorisation that makes mddev_find easier to read.


Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2e05b0c..d3579fc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -416,6 +416,27 @@ static void mddev_put(mddev_t *mddev)
 	spin_unlock(&all_mddevs_lock);
 }
 
+static void mddev_init(mddev_t *mddev)
+{
+	mutex_init(&mddev->open_mutex);
+	mutex_init(&mddev->reconfig_mutex);
+	mutex_init(&mddev->bitmap_info.mutex);
+	INIT_LIST_HEAD(&mddev->disks);
+	INIT_LIST_HEAD(&mddev->all_mddevs);
+	init_timer(&mddev->safemode_timer);
+	atomic_set(&mddev->active, 1);
+	atomic_set(&mddev->openers, 0);
+	atomic_set(&mddev->active_io, 0);
+	spin_lock_init(&mddev->write_lock);
+	atomic_set(&mddev->flush_pending, 0);
+	init_waitqueue_head(&mddev->sb_wait);
+	init_waitqueue_head(&mddev->recovery_wait);
+	mddev->reshape_position = MaxSector;
+	mddev->resync_min = 0;
+	mddev->resync_max = MaxSector;
+	mddev->level = LEVEL_NONE;
+}
+
 static mddev_t * mddev_find(dev_t unit)
 {
 	mddev_t *mddev, *new = NULL;
@@ -482,23 +503,7 @@ static mddev_t * mddev_find(dev_t unit)
 	else
 		new->md_minor = MINOR(unit) >> MdpMinorShift;
 
-	mutex_init(&new->open_mutex);
-	mutex_init(&new->reconfig_mutex);
-	mutex_init(&new->bitmap_info.mutex);
-	INIT_LIST_HEAD(&new->disks);
-	INIT_LIST_HEAD(&new->all_mddevs);
-	init_timer(&new->safemode_timer);
-	atomic_set(&new->active, 1);
-	atomic_set(&new->openers, 0);
-	atomic_set(&new->active_io, 0);
-	spin_lock_init(&new->write_lock);
-	atomic_set(&new->flush_pending, 0);
-	init_waitqueue_head(&new->sb_wait);
-	init_waitqueue_head(&new->recovery_wait);
-	new->reshape_position = MaxSector;
-	new->resync_min = 0;
-	new->resync_max = MaxSector;
-	new->level = LEVEL_NONE;
+	mddev_init(new);
 
 	goto retry;
 }
-- 
cgit v0.10.2


From 9e35b99c7efacfddc748c89a0c53b1122b0ee72c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 6 Apr 2010 14:23:02 +1000
Subject: md: don't unregister the thread in mddev_suspend

This is
 - unnecessary because mddev_suspend is always followed by a call to
   ->stop, and each ->stop unregisters the thread, and
 - a problem as it makes it awkwards to suspend and then resume a
   device as we will want later.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d3579fc..af0780a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -254,6 +254,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	return rv;
 }
 
+/* mddev_suspend makes sure no new requests are submitted
+ * to the device, and that any requests that have been submitted
+ * are completely handled.
+ * Once ->stop is called and completes, the module will be completely
+ * unused.
+ */
 static void mddev_suspend(mddev_t *mddev)
 {
 	BUG_ON(mddev->suspended);
@@ -261,13 +267,6 @@ static void mddev_suspend(mddev_t *mddev)
 	synchronize_rcu();
 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 	mddev->pers->quiesce(mddev, 1);
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
-	/* we now know that no code is executing in the personality module,
-	 * except possibly the tail end of a ->bi_end_io function, but that
-	 * is certain to complete before the module has a chance to get
-	 * unloaded
-	 */
 }
 
 static void mddev_resume(mddev_t *mddev)
-- 
cgit v0.10.2


From d754c5ae1ff76b20d3ecde8ad666d7865eada8ae Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 7 Apr 2010 12:14:43 +1000
Subject: md/raid1: fix confusing 'redirect sector' message.

This message seems to suggest the named device is the one on which a
read failed, however it is actually the device that the read will be
redirected to.
So make the message a little clearer.

Reported-by: Tim Burgess <ozburgess@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5ff75c4..2e08e48 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1689,10 +1689,10 @@ static void raid1d(mddev_t *mddev)
 				r1_bio->bios[r1_bio->read_disk] = bio;
 				rdev = conf->mirrors[disk].rdev;
 				if (printk_ratelimit())
-					printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
-					       " another mirror\n",
-					       bdevname(rdev->bdev,b),
-					       (unsigned long long)r1_bio->sector);
+					printk(KERN_ERR "raid1: redirecting sector %llu to"
+					       " other mirror: %s\n",
+					       (unsigned long long)r1_bio->sector,
+					       bdevname(rdev->bdev,b));
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_end_io = raid1_end_read_request;
-- 
cgit v0.10.2


From e555190d82c0f58e825e3cbd9e6ebe2e7ac713bd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 31 Mar 2010 11:21:44 +1100
Subject: md/raid1: delay reads that could overtake behind-writes.

When a raid1 array is configured to support write-behind
on some devices, it normally only reads from other devices.
If all devices are write-behind (because the rest have failed)
it is possible for a read request to be serviced before a
behind-write request, which would appear as data corruption.

So when forced to read from a WriteMostly device, wait for any
write-behind to complete, and don't start any more behind-writes.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 49d6080..c9c6a34 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1356,7 +1356,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 {
 	if (!bitmap) return;
 	if (behind) {
-		atomic_dec(&bitmap->behind_writes);
+		if (atomic_dec_and_test(&bitmap->behind_writes))
+			wake_up(&bitmap->behind_wait);
 		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
 		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
 	}
@@ -1680,6 +1681,7 @@ int bitmap_create(mddev_t *mddev)
 	atomic_set(&bitmap->pending_writes, 0);
 	init_waitqueue_head(&bitmap->write_wait);
 	init_waitqueue_head(&bitmap->overflow_wait);
+	init_waitqueue_head(&bitmap->behind_wait);
 
 	bitmap->mddev = mddev;
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index aa82b7c..3797dea 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -240,6 +240,7 @@ struct bitmap {
 	atomic_t pending_writes; /* pending writes to the bitmap file */
 	wait_queue_head_t write_wait;
 	wait_queue_head_t overflow_wait;
+	wait_queue_head_t behind_wait;
 
 	struct sysfs_dirent *sysfs_can_clear;
 };
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2e08e48..cb2da87 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -857,6 +857,15 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		}
 		mirror = conf->mirrors + rdisk;
 
+		if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+		    bitmap) {
+			/* Reading from a write-mostly device must
+			 * take care not to over-take any writes
+			 * that are 'behind'
+			 */
+			wait_event(bitmap->behind_wait,
+				   atomic_read(&bitmap->behind_writes) == 0);
+		}
 		r1_bio->read_disk = rdisk;
 
 		read_bio = bio_clone(bio, GFP_NOIO);
@@ -934,10 +943,14 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		set_bit(R1BIO_Degraded, &r1_bio->state);
 	}
 
-	/* do behind I/O ? */
+	/* do behind I/O ?
+	 * Not if there are too many, or cannot allocate memory,
+	 * or a reader on WriteMostly is waiting for behind writes 
+	 * to flush */
 	if (bitmap &&
 	    (atomic_read(&bitmap->behind_writes)
 	     < mddev->bitmap_info.max_write_behind) &&
+	    !waitqueue_active(&bitmap->behind_wait) &&
 	    (behind_pages = alloc_behind_pages(bio)) != NULL)
 		set_bit(R1BIO_BehindIO, &r1_bio->state);
 
@@ -2144,15 +2157,13 @@ static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev->private;
 	struct bitmap *bitmap = mddev->bitmap;
-	int behind_wait = 0;
 
 	/* wait for behind writes to complete */
-	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
-		behind_wait++;
-		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(HZ); /* wait a second */
+	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev));
 		/* need to kick something here to make sure I/O goes? */
+		wait_event(bitmap->behind_wait,
+			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 
 	raise_barrier(conf);
-- 
cgit v0.10.2


From f1b29bcae116409db5e543622aadab43041c9ae9 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 1 May 2010 18:09:05 -0700
Subject: md/raid4: permit raid0 takeover

For consistency allow raid4 to takeover raid0 in addition to raid5 (with a
raid4 layout).

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2882a26..81563b7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5610,10 +5610,17 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 }
 
 
-static void *raid5_takeover_raid0(mddev_t *mddev)
+static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 {
+	struct raid0_private_data *raid0_priv = mddev->private;
 
-	mddev->new_level = 5;
+	/* for raid0 takeover only one zone is supported */
+	if (raid0_priv->nr_strip_zones > 1) {
+		printk(KERN_ERR "md: cannot takeover raid0 with more than one zone.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	mddev->new_level = level;
 	mddev->new_layout = ALGORITHM_PARITY_N;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->raid_disks += 1;
@@ -5749,22 +5756,13 @@ static int raid6_check_reshape(mddev_t *mddev)
 static void *raid5_takeover(mddev_t *mddev)
 {
 	/* raid5 can take over:
-	 *  raid0 - if all devices are the same - make it a raid4 layout
+	 *  raid0 - if there is only one strip zone - make it a raid4 layout
 	 *  raid1 - if there are two drives.  We need to know the chunk size
 	 *  raid4 - trivial - just use a raid4 layout.
 	 *  raid6 - Providing it is a *_6 layout
 	 */
-	if (mddev->level == 0) {
-		/* for raid0 takeover only one zone is supported */
-		struct raid0_private_data *raid0_priv
-			= mddev->private;
-		if (raid0_priv->nr_strip_zones > 1) {
-			printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n");
-			return ERR_PTR(-EINVAL);
-		}
-		return raid5_takeover_raid0(mddev);
-	}
-
+	if (mddev->level == 0)
+		return raid45_takeover_raid0(mddev, 5);
 	if (mddev->level == 1)
 		return raid5_takeover_raid1(mddev);
 	if (mddev->level == 4) {
@@ -5780,8 +5778,12 @@ static void *raid5_takeover(mddev_t *mddev)
 
 static void *raid4_takeover(mddev_t *mddev)
 {
-	/* raid4 can take over raid5 if layout is right.
+	/* raid4 can take over:
+	 *  raid0 - if there is only one strip zone
+	 *  raid5 - if layout is right
 	 */
+	if (mddev->level == 0)
+		return raid45_takeover_raid0(mddev, 4);
 	if (mddev->level == 5 &&
 	    mddev->layout == ALGORITHM_PARITY_N) {
 		mddev->new_layout = 0;
-- 
cgit v0.10.2


From bb7f8d2217d8753ab5008c78f16697d9e697d570 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 1 May 2010 18:14:57 -0700
Subject: md: notify mdstat waiters of level change

Level modifications change the output of mdstat.  The mdmon manager
thread is interested in these events for external metadata management.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index af0780a..69f659e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3075,6 +3075,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	sysfs_notify(&mddev->kobj, NULL, "level");
+	md_new_event(mddev);
 	return rv;
 }
 
-- 
cgit v0.10.2


From f2859af6716ce99cac7f35c5a0c6b7fed346312f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 2 May 2010 10:04:16 -0700
Subject: md: allow integers to be passed to md/level

e.g. allow md to interpret 'echo 4 > md/level' as a request for raid4.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 69f659e..b8a0fcf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2934,9 +2934,10 @@ level_show(mddev_t *mddev, char *page)
 static ssize_t
 level_store(mddev_t *mddev, const char *buf, size_t len)
 {
-	char level[16];
+	char clevel[16];
 	ssize_t rv = len;
 	struct mdk_personality *pers;
+	long level;
 	void *priv;
 	mdk_rdev_t *rdev;
 
@@ -2969,19 +2970,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 
 	/* Now find the new personality */
-	if (len == 0 || len >= sizeof(level))
+	if (len == 0 || len >= sizeof(clevel))
 		return -EINVAL;
-	strncpy(level, buf, len);
-	if (level[len-1] == '\n')
+	strncpy(clevel, buf, len);
+	if (clevel[len-1] == '\n')
 		len--;
-	level[len] = 0;
+	clevel[len] = 0;
+	if (strict_strtol(clevel, 10, &level))
+		level = LEVEL_NONE;
 
-	request_module("md-%s", level);
+	if (request_module("md-%s", clevel) != 0)
+		request_module("md-level-%s", clevel);
 	spin_lock(&pers_lock);
-	pers = find_pers(LEVEL_NONE, level);
+	pers = find_pers(level, clevel);
 	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality %s not loaded\n", level);
+		printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
 		return -EINVAL;
 	}
 	spin_unlock(&pers_lock);
@@ -2994,7 +2998,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	if (!pers->takeover) {
 		module_put(pers->owner);
 		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
-		       mdname(mddev), level);
+		       mdname(mddev), clevel);
 		return -EINVAL;
 	}
 
@@ -3010,7 +3014,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		mddev->delta_disks = 0;
 		module_put(pers->owner);
 		printk(KERN_WARNING "md: %s: %s would not accept array\n",
-		       mdname(mddev), level);
+		       mdname(mddev), clevel);
 		return PTR_ERR(priv);
 	}
 
-- 
cgit v0.10.2


From 08fb730ca346ff16598ef31911c88fbca6133bf5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 May 2010 13:16:56 +1000
Subject: md: remove EXPERIMENTAL designation from RAID10

RAID10 has been available for quite a while now and is quite well
tested, so we can remove the EXPERIMENTAL designation.

Reported-by: Eric MSP Veith <eveith@wwweb-library.net>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index acb3a4e..4a6feac 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -100,8 +100,8 @@ config MD_RAID1
 	  If unsure, say Y.
 
 config MD_RAID10
-	tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
-	depends on BLK_DEV_MD && EXPERIMENTAL
+	tristate "RAID-10 (mirrored striping) mode"
+	depends on BLK_DEV_MD
 	---help---
 	  RAID-10 provides a combination of striping (RAID-0) and
 	  mirroring (RAID-1) with easier configuration and more flexible
-- 
cgit v0.10.2


From 0c55e02259115c151e4835dd417cf41467bb02e2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 May 2010 14:09:02 +1000
Subject: md/raid5: improve consistency of error messages.

Many 'printk' messages from the raid456 module mention 'raid5' even
though it may be a 'raid6' or even 'raid4' array.  This can cause
confusion.
Also the actual array name is not always reported and when it is
it is not reported consistently.

So change all the messages to start:
    md/raid:%s:
where '%s' becomes e.g. md3 to identify the particular array.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 81563b7..cee9f93 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1509,7 +1509,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			rdev = conf->disks[i].rdev;
-			printk_rl(KERN_INFO "raid5:%s: read error corrected"
+			printk_rl(KERN_INFO "md/raid:%s: read error corrected"
 				  " (%lu sectors at %llu on %s)\n",
 				  mdname(conf->mddev), STRIPE_SECTORS,
 				  (unsigned long long)(sh->sector
@@ -1529,7 +1529,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		atomic_inc(&rdev->read_errors);
 		if (conf->mddev->degraded)
 			printk_rl(KERN_WARNING
-				  "raid5:%s: read error not correctable "
+				  "md/raid:%s: read error not correctable "
 				  "(sector %llu on %s).\n",
 				  mdname(conf->mddev),
 				  (unsigned long long)(sh->sector
@@ -1538,7 +1538,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
 			printk_rl(KERN_WARNING
-				  "raid5:%s: read error NOT corrected!! "
+				  "md/raid:%s: read error NOT corrected!! "
 				  "(sector %llu on %s).\n",
 				  mdname(conf->mddev),
 				  (unsigned long long)(sh->sector
@@ -1547,7 +1547,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
 			printk(KERN_WARNING
-			       "raid5:%s: Too many read errors, failing device %s.\n",
+			       "md/raid:%s: Too many read errors, failing device %s.\n",
 			       mdname(conf->mddev), bdn);
 		else
 			retry = 1;
@@ -1620,7 +1620,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	raid5_conf_t *conf = mddev->private;
-	pr_debug("raid5: error called\n");
+	pr_debug("raid456: error called\n");
 
 	if (!test_bit(Faulty, &rdev->flags)) {
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1636,9 +1636,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		}
 		set_bit(Faulty, &rdev->flags);
 		printk(KERN_ALERT
-		       "raid5: Disk failure on %s, disabling device.\n"
-		       "raid5: Operation continuing on %d devices.\n",
-		       bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
+		       "md/raid:%s: Disk failure on %s, disabling device.\n"
+		       KERN_ALERT
+		       "md/raid:%s: Operation continuing on %d devices.\n",
+		       mdname(mddev),
+		       bdevname(rdev->bdev, b),
+		       mdname(mddev),
+		       conf->raid_disks - mddev->degraded);
 	}
 }
 
@@ -1719,8 +1723,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 			pd_idx = data_disks;
 			break;
 		default:
-			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-				algorithm);
 			BUG();
 		}
 		break;
@@ -1836,10 +1838,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 			qd_idx = raid_disks - 1;
 			break;
 
-
 		default:
-			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-			       algorithm);
 			BUG();
 		}
 		break;
@@ -1902,8 +1901,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 		case ALGORITHM_PARITY_N:
 			break;
 		default:
-			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-			       algorithm);
 			BUG();
 		}
 		break;
@@ -1962,8 +1959,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 			i -= 1;
 			break;
 		default:
-			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-			       algorithm);
 			BUG();
 		}
 		break;
@@ -1976,7 +1971,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 				     previous, &dummy1, &sh2);
 	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
 		|| sh2.qd_idx != sh->qd_idx) {
-		printk(KERN_ERR "compute_blocknr: map not correct\n");
+		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
+		       mdname(conf->mddev));
 		return 0;
 	}
 	return r_sector;
@@ -3942,7 +3938,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 		new_sector = raid5_compute_sector(conf, logical_sector,
 						  previous,
 						  &dd_idx, NULL);
-		pr_debug("raid5: make_request, sector %llu logical %llu\n",
+		pr_debug("raid456: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector, 
 			(unsigned long long)logical_sector);
 
@@ -4721,7 +4717,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	if (mddev->new_level != 5
 	    && mddev->new_level != 4
 	    && mddev->new_level != 6) {
-		printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
+		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
 		       mdname(mddev), mddev->new_level);
 		return ERR_PTR(-EIO);
 	}
@@ -4729,12 +4725,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	     && !algorithm_valid_raid5(mddev->new_layout)) ||
 	    (mddev->new_level == 6
 	     && !algorithm_valid_raid6(mddev->new_layout))) {
-		printk(KERN_ERR "raid5: %s: layout %d not supported\n",
+		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
 		       mdname(mddev), mddev->new_layout);
 		return ERR_PTR(-EIO);
 	}
 	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
-		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
+		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
 		       mdname(mddev), mddev->raid_disks);
 		return ERR_PTR(-EINVAL);
 	}
@@ -4742,8 +4738,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	if (!mddev->new_chunk_sectors ||
 	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
 	    !is_power_of_2(mddev->new_chunk_sectors)) {
-		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
-		       mddev->new_chunk_sectors << 9, mdname(mddev));
+		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
+		       mdname(mddev), mddev->new_chunk_sectors << 9);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -4785,7 +4781,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	if (raid5_alloc_percpu(conf) != 0)
 		goto abort;
 
-	pr_debug("raid5: run(%s) called.\n", mdname(mddev));
+	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		raid_disk = rdev->raid_disk;
@@ -4798,9 +4794,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 
 		if (test_bit(In_sync, &rdev->flags)) {
 			char b[BDEVNAME_SIZE];
-			printk(KERN_INFO "raid5: device %s operational as raid"
-				" disk %d\n", bdevname(rdev->bdev,b),
-				raid_disk);
+			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
+			       " disk %d\n",
+			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
 		} else
 			/* Cannot rely on bitmap to complete recovery */
 			conf->fullsync = 1;
@@ -4824,16 +4820,17 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	if (grow_stripes(conf, conf->max_nr_stripes)) {
 		printk(KERN_ERR
-			"raid5: couldn't allocate %dkB for buffers\n", memory);
+		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
+		       mdname(mddev), memory);
 		goto abort;
 	} else
-		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
-			memory, mdname(mddev));
+		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
+		       mdname(mddev), memory);
 
 	conf->thread = md_register_thread(raid5d, mddev, NULL);
 	if (!conf->thread) {
 		printk(KERN_ERR
-		       "raid5: couldn't allocate thread for %s\n",
+		       "md/raid:%s: couldn't allocate thread.\n",
 		       mdname(mddev));
 		goto abort;
 	}
@@ -4884,7 +4881,7 @@ static int run(mddev_t *mddev)
 	sector_t reshape_offset = 0;
 
 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "raid5: %s is not clean"
+		printk(KERN_NOTICE "md/raid:%s: not clean"
 		       " -- starting background reconstruction\n",
 		       mdname(mddev));
 	if (mddev->reshape_position != MaxSector) {
@@ -4898,7 +4895,7 @@ static int run(mddev_t *mddev)
 		int max_degraded = (mddev->level == 6 ? 2 : 1);
 
 		if (mddev->new_level != mddev->level) {
-			printk(KERN_ERR "raid5: %s: unsupported reshape "
+			printk(KERN_ERR "md/raid:%s: unsupported reshape "
 			       "required - aborting.\n",
 			       mdname(mddev));
 			return -EINVAL;
@@ -4911,8 +4908,8 @@ static int run(mddev_t *mddev)
 		here_new = mddev->reshape_position;
 		if (sector_div(here_new, mddev->new_chunk_sectors *
 			       (mddev->raid_disks - max_degraded))) {
-			printk(KERN_ERR "raid5: reshape_position not "
-			       "on a stripe boundary\n");
+			printk(KERN_ERR "md/raid:%s: reshape_position not "
+			       "on a stripe boundary\n", mdname(mddev));
 			return -EINVAL;
 		}
 		reshape_offset = here_new * mddev->new_chunk_sectors;
@@ -4933,8 +4930,9 @@ static int run(mddev_t *mddev)
 			if ((here_new * mddev->new_chunk_sectors != 
 			     here_old * mddev->chunk_sectors) ||
 			    mddev->ro == 0) {
-				printk(KERN_ERR "raid5: in-place reshape must be started"
-				       " in read-only mode - aborting\n");
+				printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
+				       " in read-only mode - aborting\n",
+				       mdname(mddev));
 				return -EINVAL;
 			}
 		} else if (mddev->delta_disks < 0
@@ -4943,11 +4941,13 @@ static int run(mddev_t *mddev)
 		    : (here_new * mddev->new_chunk_sectors >=
 		       here_old * mddev->chunk_sectors)) {
 			/* Reading from the same stripe as writing to - bad */
-			printk(KERN_ERR "raid5: reshape_position too early for "
-			       "auto-recovery - aborting.\n");
+			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
+			       "auto-recovery - aborting.\n",
+			       mdname(mddev));
 			return -EINVAL;
 		}
-		printk(KERN_INFO "raid5: reshape will continue\n");
+		printk(KERN_INFO "md/raid:%s: reshape will continue\n",
+		       mdname(mddev));
 		/* OK, we should be able to continue; */
 	} else {
 		BUG_ON(mddev->level != mddev->new_level);
@@ -4989,18 +4989,6 @@ static int run(mddev_t *mddev)
 		    mddev->minor_version > 90)
 			rdev->recovery_offset = reshape_offset;
 			
-		printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
-		       rdev->raid_disk, working_disks, conf->prev_algo,
-		       conf->previous_raid_disks, conf->max_degraded,
-		       conf->algorithm, conf->raid_disks, 
-		       only_parity(rdev->raid_disk,
-				   conf->prev_algo,
-				   conf->previous_raid_disks,
-				   conf->max_degraded),
-		       only_parity(rdev->raid_disk,
-				   conf->algorithm,
-				   conf->raid_disks,
-				   conf->max_degraded));
 		if (rdev->recovery_offset < reshape_offset) {
 			/* We need to check old and new layout */
 			if (!only_parity(rdev->raid_disk,
@@ -5021,7 +5009,7 @@ static int run(mddev_t *mddev)
 			   - working_disks);
 
 	if (mddev->degraded > conf->max_degraded) {
-		printk(KERN_ERR "raid5: not enough operational devices for %s"
+		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 			" (%d/%d failed)\n",
 			mdname(mddev), mddev->degraded, conf->raid_disks);
 		goto abort;
@@ -5035,32 +5023,32 @@ static int run(mddev_t *mddev)
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
 			printk(KERN_WARNING
-			       "raid5: starting dirty degraded array: %s"
-			       "- data corruption possible.\n",
+			       "md/raid:%s: starting dirty degraded array"
+			       " - data corruption possible.\n",
 			       mdname(mddev));
 		else {
 			printk(KERN_ERR
-			       "raid5: cannot start dirty degraded array for %s\n",
+			       "md/raid:%s: cannot start dirty degraded array.\n",
 			       mdname(mddev));
 			goto abort;
 		}
 	}
 
 	if (mddev->degraded == 0)
-		printk("raid5: raid level %d set %s active with %d out of %d"
-		       " devices, algorithm %d\n", conf->level, mdname(mddev),
+		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
+		       " devices, algorithm %d\n", mdname(mddev), conf->level,
 		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
 		       mddev->new_layout);
 	else
-		printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
-			" out of %d devices, algorithm %d\n", conf->level,
-			mdname(mddev), mddev->raid_disks - mddev->degraded,
-			mddev->raid_disks, mddev->new_layout);
+		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
+		       " out of %d devices, algorithm %d\n",
+		       mdname(mddev), conf->level,
+		       mddev->raid_disks - mddev->degraded,
+		       mddev->raid_disks, mddev->new_layout);
 
 	print_raid5_conf(conf);
 
 	if (conf->reshape_progress != MaxSector) {
-		printk("...ok start reshape thread\n");
 		conf->reshape_safe = conf->reshape_progress;
 		atomic_set(&conf->reshape_stripes, 0);
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5087,7 +5075,7 @@ static int run(mddev_t *mddev)
 		mddev->to_remove = NULL;
 	else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
 		printk(KERN_WARNING
-		       "raid5: failed to create sysfs attributes for %s\n",
+		       "md/raid:%s: failed to create sysfs attributes.\n",
 		       mdname(mddev));
 
 	mddev->queue->queue_lock = &conf->device_lock;
@@ -5117,12 +5105,10 @@ abort:
 		free_conf(conf);
 	}
 	mddev->private = NULL;
-	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
+	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
 	return -EIO;
 }
 
-
-
 static int stop(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev->private;
@@ -5196,21 +5182,22 @@ static void print_raid5_conf (raid5_conf_t *conf)
 	int i;
 	struct disk_info *tmp;
 
-	printk("RAID5 conf printout:\n");
+	printk(KERN_DEBUG "RAID conf printout:\n");
 	if (!conf) {
 		printk("(conf==NULL)\n");
 		return;
 	}
-	printk(" --- rd:%d wd:%d\n", conf->raid_disks,
-		 conf->raid_disks - conf->mddev->degraded);
+	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+	       conf->raid_disks,
+	       conf->raid_disks - conf->mddev->degraded);
 
 	for (i = 0; i < conf->raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
 		tmp = conf->disks + i;
 		if (tmp->rdev)
-		printk(" disk %d, o:%d, dev:%s\n",
-			i, !test_bit(Faulty, &tmp->rdev->flags),
-			bdevname(tmp->rdev->bdev,b));
+			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+			       i, !test_bit(Faulty, &tmp->rdev->flags),
+			       bdevname(tmp->rdev->bdev, b));
 	}
 }
 
@@ -5358,7 +5345,8 @@ static int check_stripe_cache(mddev_t *mddev)
 	    > conf->max_nr_stripes ||
 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
 	    > conf->max_nr_stripes) {
-		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
+		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
+		       mdname(mddev),
 		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
 			/ STRIPE_SIZE)*4);
 		return 0;
@@ -5429,7 +5417,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	 */
 	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
 	    < mddev->array_sectors) {
-		printk(KERN_ERR "md: %s: array size must be reduced "
+		printk(KERN_ERR "md/raid:%s: array size must be reduced "
 		       "before number of disks\n", mdname(mddev));
 		return -EINVAL;
 	}
@@ -5467,9 +5455,9 @@ static int raid5_start_reshape(mddev_t *mddev)
 				if (sysfs_create_link(&mddev->kobj,
 						      &rdev->kobj, nm))
 					printk(KERN_WARNING
-					       "raid5: failed to create "
-					       " link %s for %s\n",
-					       nm, mdname(mddev));
+					       "md/raid:%s: failed to create "
+					       " link %s\n",
+					       mdname(mddev), nm);
 			} else
 				break;
 		}
@@ -5616,7 +5604,8 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_priv->nr_strip_zones > 1) {
-		printk(KERN_ERR "md: cannot takeover raid0 with more than one zone.\n");
+		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
-- 
cgit v0.10.2


From 9dd1e2faf72f79a2af9dcbd059473c06648726c2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 May 2010 14:30:35 +1000
Subject: md/raid1: improve printk messages

Make sure the array name is included in a uniform way in all printk
messages.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cb2da87..1db02c4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -296,7 +296,8 @@ static void raid1_end_read_request(struct bio *bio, int error)
 		 */
 		char b[BDEVNAME_SIZE];
 		if (printk_ratelimit())
-			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
+			printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
+			       mdname(conf->mddev),
 			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
 		reschedule_retry(r1_bio);
 	}
@@ -1075,21 +1076,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	} else
 		set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
-		"raid1: Operation continuing on %d devices.\n",
-		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
+	printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
+	       KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
+	       mdname(mddev), bdevname(rdev->bdev, b),
+	       mdname(mddev), conf->raid_disks - mddev->degraded);
 }
 
 static void print_conf(conf_t *conf)
 {
 	int i;
 
-	printk("RAID1 conf printout:\n");
+	printk(KERN_DEBUG "RAID1 conf printout:\n");
 	if (!conf) {
-		printk("(!conf)\n");
+		printk(KERN_DEBUG "(!conf)\n");
 		return;
 	}
-	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
 		conf->raid_disks);
 
 	rcu_read_lock();
@@ -1097,7 +1099,7 @@ static void print_conf(conf_t *conf)
 		char b[BDEVNAME_SIZE];
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev)
-			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
+			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
 			       i, !test_bit(In_sync, &rdev->flags),
 			       !test_bit(Faulty, &rdev->flags),
 			       bdevname(rdev->bdev,b));
@@ -1458,9 +1460,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 				char b[BDEVNAME_SIZE];
 				/* Cannot read from anywhere, array is toast */
 				md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
-				printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
+				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
 				       " for block %llu\n",
-				       bdevname(bio->bi_bdev,b),
+				       mdname(mddev),
+				       bdevname(bio->bi_bdev, b),
 				       (unsigned long long)r1_bio->sector);
 				md_done_sync(mddev, r1_bio->sectors, 0);
 				put_buf(r1_bio);
@@ -1582,7 +1585,7 @@ static void fix_read_error(conf_t *conf, int read_disk,
 				else {
 					atomic_add(s, &rdev->corrected_errors);
 					printk(KERN_INFO
-					       "raid1:%s: read error corrected "
+					       "md/raid1:%s: read error corrected "
 					       "(%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
 					       (unsigned long long)(sect +
@@ -1687,8 +1690,9 @@ static void raid1d(mddev_t *mddev)
 
 			bio = r1_bio->bios[r1_bio->read_disk];
 			if ((disk=read_balance(conf, r1_bio)) == -1) {
-				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
+				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
 				       " read error for block %llu\n",
+				       mdname(mddev),
 				       bdevname(bio->bi_bdev,b),
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
@@ -1702,8 +1706,9 @@ static void raid1d(mddev_t *mddev)
 				r1_bio->bios[r1_bio->read_disk] = bio;
 				rdev = conf->mirrors[disk].rdev;
 				if (printk_ratelimit())
-					printk(KERN_ERR "raid1: redirecting sector %llu to"
+					printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
 					       " other mirror: %s\n",
+					       mdname(mddev),
 					       (unsigned long long)r1_bio->sector,
 					       bdevname(rdev->bdev,b));
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
@@ -1760,13 +1765,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	int still_degraded = 0;
 
 	if (!conf->r1buf_pool)
-	{
-/*
-		printk("sync start - bitmap %p\n", mddev->bitmap);
-*/
 		if (init_resync(conf))
 			return 0;
-	}
 
 	max_sector = mddev->dev_sectors;
 	if (sector_nr >= max_sector) {
@@ -2047,7 +2047,7 @@ static conf_t *setup_conf(mddev_t *mddev)
 
 	err = -EIO;
 	if (conf->last_used < 0) {
-		printk(KERN_ERR "raid1: no operational mirrors for %s\n",
+		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
 		       mdname(mddev));
 		goto abort;
 	}
@@ -2055,7 +2055,7 @@ static conf_t *setup_conf(mddev_t *mddev)
 	conf->thread = md_register_thread(raid1d, mddev, NULL);
 	if (!conf->thread) {
 		printk(KERN_ERR
-		       "raid1: couldn't allocate thread for %s\n",
+		       "md/raid1:%s: couldn't allocate thread\n",
 		       mdname(mddev));
 		goto abort;
 	}
@@ -2081,12 +2081,12 @@ static int run(mddev_t *mddev)
 	mdk_rdev_t *rdev;
 
 	if (mddev->level != 1) {
-		printk("raid1: %s: raid level not set to mirroring (%d)\n",
+		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
 		       mdname(mddev), mddev->level);
 		return -EIO;
 	}
 	if (mddev->reshape_position != MaxSector) {
-		printk("raid1: %s: reshape_position set but not supported\n",
+		printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
 		       mdname(mddev));
 		return -EIO;
 	}
@@ -2129,11 +2129,11 @@ static int run(mddev_t *mddev)
 		mddev->recovery_cp = MaxSector;
 
 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "raid1: %s is not clean"
+		printk(KERN_NOTICE "md/raid1:%s: not clean"
 		       " -- starting background reconstruction\n",
 		       mdname(mddev));
 	printk(KERN_INFO 
-		"raid1: raid set %s active with %d out of %d mirrors\n",
+		"md/raid1:%s: active with %d out of %d mirrors\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded, 
 		mddev->raid_disks);
 
@@ -2160,7 +2160,8 @@ static int stop(mddev_t *mddev)
 
 	/* wait for behind writes to complete */
 	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
-		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev));
+		printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
+		       mdname(mddev));
 		/* need to kick something here to make sure I/O goes? */
 		wait_event(bitmap->behind_wait,
 			   atomic_read(&bitmap->behind_writes) == 0);
@@ -2288,9 +2289,9 @@ static int raid1_reshape(mddev_t *mddev)
 			if (sysfs_create_link(&mddev->kobj,
 					      &rdev->kobj, nm))
 				printk(KERN_WARNING
-				       "md/raid1: cannot register "
-				       "%s for %s\n",
-				       nm, mdname(mddev));
+				       "md/raid1:%s: cannot register "
+				       "%s\n",
+				       mdname(mddev), nm);
 		}
 		if (rdev)
 			newmirrors[d2++].rdev = rdev;
-- 
cgit v0.10.2


From 128595ed6ff2c7358ae253a560d47a0af463bc99 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 May 2010 14:47:14 +1000
Subject: md/raid10: tidy up printk messages.

All raid10 printk messages now start
   md/raid10:md-device-name:

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a1d72761..e0742c4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -285,7 +285,8 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		char b[BDEVNAME_SIZE];
 		if (printk_ratelimit())
-			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
+			printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
+			       mdname(conf->mddev),
 			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
@@ -831,8 +832,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		bio_pair_release(bp);
 		return 0;
 	bad_map:
-		printk("raid10_make_request bug: can't convert block across chunks"
-		       " or bigger than %dk %llu %d\n", chunk_sects/2,
+		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
+		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
 		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
 
 		bio_io_error(bio);
@@ -1031,9 +1032,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
-		"raid10: Operation continuing on %d devices.\n",
-		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
+	printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
+	       KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
+	       mdname(mddev), bdevname(rdev->bdev, b),
+	       mdname(mddev), conf->raid_disks - mddev->degraded);
 }
 
 static void print_conf(conf_t *conf)
@@ -1041,19 +1043,19 @@ static void print_conf(conf_t *conf)
 	int i;
 	mirror_info_t *tmp;
 
-	printk("RAID10 conf printout:\n");
+	printk(KERN_DEBUG "RAID10 conf printout:\n");
 	if (!conf) {
-		printk("(!conf)\n");
+		printk(KERN_DEBUG "(!conf)\n");
 		return;
 	}
-	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
 		conf->raid_disks);
 
 	for (i = 0; i < conf->raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
 		tmp = conf->mirrors + i;
 		if (tmp->rdev)
-			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
+			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
 				i, !test_bit(In_sync, &tmp->rdev->flags),
 			        !test_bit(Faulty, &tmp->rdev->flags),
 				bdevname(tmp->rdev->bdev,b));
@@ -1502,13 +1504,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		if (cur_read_error_count > max_read_errors) {
 			rcu_read_unlock();
 			printk(KERN_NOTICE
-			       "raid10: %s: Raid device exceeded "
+			       "md/raid10:%s: %s: Raid device exceeded "
 			       "read_error threshold "
 			       "[cur %d:max %d]\n",
+			       mdname(mddev),
 			       b, cur_read_error_count, max_read_errors);
 			printk(KERN_NOTICE
-			       "raid10: %s: Failing raid "
-			       "device\n", b);
+			       "md/raid10:%s: %s: Failing raid "
+			       "device\n", mdname(mddev), b);
 			md_error(mddev, conf->mirrors[d].rdev);
 			return;
 		}
@@ -1578,15 +1581,16 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 				    == 0) {
 					/* Well, this device is dead */
 					printk(KERN_NOTICE
-					       "raid10:%s: read correction "
+					       "md/raid10:%s: read correction "
 					       "write failed"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
 					       (unsigned long long)(sect+
 					       rdev->data_offset),
 					       bdevname(rdev->bdev, b));
-					printk(KERN_NOTICE "raid10:%s: failing "
+					printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 					       "drive\n",
+					       mdname(mddev),
 					       bdevname(rdev->bdev, b));
 					md_error(mddev, rdev);
 				}
@@ -1614,20 +1618,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 						 READ) == 0) {
 					/* Well, this device is dead */
 					printk(KERN_NOTICE
-					       "raid10:%s: unable to read back "
+					       "md/raid10:%s: unable to read back "
 					       "corrected sectors"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
 					       (unsigned long long)(sect+
 						    rdev->data_offset),
 					       bdevname(rdev->bdev, b));
-					printk(KERN_NOTICE "raid10:%s: failing drive\n",
+					printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
+					       mdname(mddev),
 					       bdevname(rdev->bdev, b));
 
 					md_error(mddev, rdev);
 				} else {
 					printk(KERN_INFO
-					       "raid10:%s: read error corrected"
+					       "md/raid10:%s: read error corrected"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
 					       (unsigned long long)(sect+
@@ -1702,8 +1707,9 @@ static void raid10d(mddev_t *mddev)
 				mddev->ro ? IO_BLOCKED : NULL;
 			mirror = read_balance(conf, r10_bio);
 			if (mirror == -1) {
-				printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
+				printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
 				       " read error for block %llu\n",
+				       mdname(mddev),
 				       bdevname(bio->bi_bdev,b),
 				       (unsigned long long)r10_bio->sector);
 				raid_end_bio_io(r10_bio);
@@ -1713,8 +1719,9 @@ static void raid10d(mddev_t *mddev)
 				bio_put(bio);
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
-					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
+					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
 					       " another mirror\n",
+					       mdname(mddev),
 					       bdevname(rdev->bdev,b),
 					       (unsigned long long)r10_bio->sector);
 				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
@@ -1972,7 +1979,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 					r10_bio = rb2;
 					if (!test_and_set_bit(MD_RECOVERY_INTR,
 							      &mddev->recovery))
-						printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
+						printk(KERN_INFO "md/raid10:%s: insufficient "
+						       "working devices for recovery.\n",
 						       mdname(mddev));
 					break;
 				}
@@ -2154,8 +2162,9 @@ static conf_t *setup_conf(mddev_t *mddev)
 
 	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(mddev->chunk_sectors)) {
-		printk(KERN_ERR "md/raid10: chunk size must be "
-		       "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE);
+		printk(KERN_ERR "md/raid10:%s: chunk size must be "
+		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
+		       mdname(mddev), PAGE_SIZE);
 		goto out;
 	}
 
@@ -2165,7 +2174,7 @@ static conf_t *setup_conf(mddev_t *mddev)
 
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
 	    (mddev->layout >> 17)) {
-		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
+		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
 		       mdname(mddev), mddev->layout);
 		goto out;
 	}
@@ -2236,7 +2245,7 @@ static conf_t *setup_conf(mddev_t *mddev)
 	return conf;
 
  out:
-	printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
+	printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
 	       mdname(mddev));
 	if (conf) {
 		if (conf->r10bio_pool)
@@ -2314,7 +2323,7 @@ static int run(mddev_t *mddev)
 	}
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf)) {
-		printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
+		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
@@ -2334,11 +2343,11 @@ static int run(mddev_t *mddev)
 	}
 
 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "raid10: %s is not clean"
+		printk(KERN_NOTICE "md/raid10:%s: not clean"
 		       " -- starting background reconstruction\n",
 		       mdname(mddev));
 	printk(KERN_INFO
-		"raid10: raid set %s active with %d out of %d devices\n",
+		"md/raid10:%s: active with %d out of %d devices\n",
 		mdname(mddev), conf->raid_disks - mddev->degraded,
 		conf->raid_disks);
 	/*
@@ -2420,7 +2429,8 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 	conf_t *conf;
 
 	if (mddev->degraded > 0) {
-		printk(KERN_ERR "error: degraded raid0!\n");
+		printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
+		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -2458,7 +2468,9 @@ static void *raid10_takeover(mddev_t *mddev)
 		/* for raid0 takeover only one zone is supported */
 		raid0_priv = mddev->private;
 		if (raid0_priv->nr_strip_zones > 1) {
-			printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n");
+			printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
+			       " with more than one zone.\n",
+			       mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
 		return raid10_takeover_raid0(mddev);
-- 
cgit v0.10.2


From b5a20961f3479dda48bdc340354ee5469997839d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 May 2010 15:06:27 +1000
Subject: md/raid0: tidy up printk messages.

All messages now start
   md/raid0:md-device-name:

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 9f9c6b7..dc38c1a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -75,10 +75,10 @@ static void dump_zones(mddev_t *mddev)
 	for (j = 0; j < conf->nr_strip_zones; j++) {
 		printk(KERN_INFO "zone%d=[", j);
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-			printk("%s/",
+			printk(KERN_CONT "%s/",
 			bdevname(conf->devlist[j*raid_disks
 						+ k]->bdev, b));
-		printk("]\n");
+		printk(KERN_CONT "]\n");
 
 		zone_size  = conf->strip_zone[j].zone_end - zone_start;
 		printk(KERN_INFO "        zone offset=%llukb "
@@ -104,8 +104,9 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 	if (!conf)
 		return -ENOMEM;
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
-		printk(KERN_INFO "raid0: looking at %s\n",
-			bdevname(rdev1->bdev,b));
+		printk(KERN_INFO "md/raid0:%s: looking at %s\n",
+		       mdname(mddev),
+		       bdevname(rdev1->bdev, b));
 		c = 0;
 
 		/* round size to chunk_size */
@@ -114,14 +115,16 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 		rdev1->sectors = sectors * mddev->chunk_sectors;
 
 		list_for_each_entry(rdev2, &mddev->disks, same_set) {
-			printk(KERN_INFO "raid0:   comparing %s(%llu)",
+			printk(KERN_INFO "md/raid0:%s:   comparing %s(%llu)",
+			       mdname(mddev),
 			       bdevname(rdev1->bdev,b),
 			       (unsigned long long)rdev1->sectors);
-			printk(KERN_INFO " with %s(%llu)\n",
+			printk(KERN_CONT " with %s(%llu)\n",
 			       bdevname(rdev2->bdev,b),
 			       (unsigned long long)rdev2->sectors);
 			if (rdev2 == rdev1) {
-				printk(KERN_INFO "raid0:   END\n");
+				printk(KERN_INFO "md/raid0:%s:   END\n",
+				       mdname(mddev));
 				break;
 			}
 			if (rdev2->sectors == rdev1->sectors) {
@@ -129,20 +132,24 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 				 * Not unique, don't count it as a new
 				 * group
 				 */
-				printk(KERN_INFO "raid0:   EQUAL\n");
+				printk(KERN_INFO "md/raid0:%s:   EQUAL\n",
+				       mdname(mddev));
 				c = 1;
 				break;
 			}
-			printk(KERN_INFO "raid0:   NOT EQUAL\n");
+			printk(KERN_INFO "md/raid0:%s:   NOT EQUAL\n",
+			       mdname(mddev));
 		}
 		if (!c) {
-			printk(KERN_INFO "raid0:   ==> UNIQUE\n");
+			printk(KERN_INFO "md/raid0:%s:   ==> UNIQUE\n",
+			       mdname(mddev));
 			conf->nr_strip_zones++;
-			printk(KERN_INFO "raid0: %d zones\n",
-				conf->nr_strip_zones);
+			printk(KERN_INFO "md/raid0:%s: %d zones\n",
+			       mdname(mddev), conf->nr_strip_zones);
 		}
 	}
-	printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
+	printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n",
+	       mdname(mddev), conf->nr_strip_zones);
 	err = -ENOMEM;
 	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
@@ -170,13 +177,13 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 			j /= 2;
 
 		if (j < 0 || j >= mddev->raid_disks) {
-			printk(KERN_ERR "raid0: bad disk number %d - "
-				"aborting!\n", j);
+			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
+			       "aborting!\n", mdname(mddev), j);
 			goto abort;
 		}
 		if (dev[j]) {
-			printk(KERN_ERR "raid0: multiple devices for %d - "
-				"aborting!\n", j);
+			printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
+			       "aborting!\n", mdname(mddev), j);
 			goto abort;
 		}
 		dev[j] = rdev1;
@@ -198,8 +205,8 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 		cnt++;
 	}
 	if (cnt != mddev->raid_disks) {
-		printk(KERN_ERR "raid0: too few disks (%d of %d) - "
-			"aborting!\n", cnt, mddev->raid_disks);
+		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
+		       "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
 		goto abort;
 	}
 	zone->nb_dev = cnt;
@@ -215,39 +222,44 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 		zone = conf->strip_zone + i;
 		dev = conf->devlist + i * mddev->raid_disks;
 
-		printk(KERN_INFO "raid0: zone %d\n", i);
+		printk(KERN_INFO "md/raid0:%s: zone %d\n",
+		       mdname(mddev), i);
 		zone->dev_start = smallest->sectors;
 		smallest = NULL;
 		c = 0;
 
 		for (j=0; j<cnt; j++) {
 			rdev = conf->devlist[j];
-			printk(KERN_INFO "raid0: checking %s ...",
-				bdevname(rdev->bdev, b));
+			printk(KERN_INFO "md/raid0:%s: checking %s ...",
+			       mdname(mddev),
+			       bdevname(rdev->bdev, b));
 			if (rdev->sectors <= zone->dev_start) {
-				printk(KERN_INFO " nope.\n");
+				printk(KERN_CONT " nope.\n");
 				continue;
 			}
-			printk(KERN_INFO " contained as device %d\n", c);
+			printk(KERN_CONT " contained as device %d\n", c);
 			dev[c] = rdev;
 			c++;
 			if (!smallest || rdev->sectors < smallest->sectors) {
 				smallest = rdev;
-				printk(KERN_INFO "  (%llu) is smallest!.\n",
-					(unsigned long long)rdev->sectors);
+				printk(KERN_INFO "md/raid0:%s:  (%llu) is smallest!.\n",
+				       mdname(mddev),
+				       (unsigned long long)rdev->sectors);
 			}
 		}
 
 		zone->nb_dev = c;
 		sectors = (smallest->sectors - zone->dev_start) * c;
-		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
-			zone->nb_dev, (unsigned long long)sectors);
+		printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
+		       mdname(mddev),
+		       zone->nb_dev, (unsigned long long)sectors);
 
 		curr_zone_end += sectors;
 		zone->zone_end = curr_zone_end;
 
-		printk(KERN_INFO "raid0: current zone start: %llu\n",
-			(unsigned long long)smallest->sectors);
+		printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n",
+		       mdname(mddev),
+		       (unsigned long long)smallest->sectors);
 	}
 	mddev->queue->unplug_fn = raid0_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
@@ -258,7 +270,7 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 	 * chunk size is a multiple of that sector size
 	 */
 	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
-		printk(KERN_ERR "%s chunk_size of %d not valid\n",
+		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
 		       mdname(mddev),
 		       mddev->chunk_sectors << 9);
 		goto abort;
@@ -268,7 +280,7 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 	blk_queue_io_opt(mddev->queue,
 			 (mddev->chunk_sectors << 9) * mddev->raid_disks);
 
-	printk(KERN_INFO "raid0: done.\n");
+	printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev));
 	*private_conf = conf;
 
 	return 0;
@@ -331,7 +343,8 @@ static int raid0_run(mddev_t *mddev)
 	int ret;
 
 	if (mddev->chunk_sectors == 0) {
-		printk(KERN_ERR "md/raid0: chunk size must be set.\n");
+		printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
+		       mdname(mddev));
 		return -EINVAL;
 	}
 	if (md_check_no_bitmap(mddev))
@@ -357,8 +370,9 @@ static int raid0_run(mddev_t *mddev)
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
 
-	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
-		(unsigned long long)mddev->array_sectors);
+	printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
+	       mdname(mddev),
+	       (unsigned long long)mddev->array_sectors);
 	/* calculate the max read-ahead size.
 	 * For read-ahead of large files to be effective, we need to
 	 * readahead at least twice a whole stripe. i.e. number of devices
@@ -516,9 +530,10 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 	return 1;
 
 bad_map:
-	printk("raid0_make_request bug: can't convert block across chunks"
-		" or bigger than %dk %llu %d\n", chunk_sects / 2,
-		(unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+	printk("md/raid0:%s: make_request bug: can't convert block across chunks"
+	       " or bigger than %dk %llu %d\n",
+	       mdname(mddev), chunk_sects / 2,
+	       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
 
 	bio_io_error(bio);
 	return 0;
@@ -563,7 +578,8 @@ static void *raid0_takeover_raid5(mddev_t *mddev)
 	raid0_conf_t *priv_conf;
 
 	if (mddev->degraded != 1) {
-		printk(KERN_ERR "md: raid5 must be degraded! Degraded disks: %d\n",
+		printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
+		       mdname(mddev),
 		       mddev->degraded);
 		return ERR_PTR(-EINVAL);
 	}
@@ -571,7 +587,8 @@ static void *raid0_takeover_raid5(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		/* check slot number for a disk */
 		if (rdev->raid_disk == mddev->raid_disks-1) {
-			printk(KERN_ERR "md: raid5 must have missing parity disk!\n");
+			printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
+			       mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
 	}
@@ -599,16 +616,19 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 	 *  - all mirrors must be already degraded
 	 */
 	if (mddev->layout != ((1 << 8) + 2)) {
-		printk(KERN_ERR "md: Raid0 cannot takover layout: %x\n",
+		printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n",
+		       mdname(mddev),
 		       mddev->layout);
 		return ERR_PTR(-EINVAL);
 	}
 	if (mddev->raid_disks & 1) {
-		printk(KERN_ERR "md: Raid0 cannot takover Raid10 with odd disk number.\n");
+		printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n",
+		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 	if (mddev->degraded != (mddev->raid_disks>>1)) {
-		printk(KERN_ERR "md: All mirrors must be already degraded!\n");
+		printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
+		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -636,8 +656,8 @@ static void *raid0_takeover(mddev_t *mddev)
 		if (mddev->layout == ALGORITHM_PARITY_N)
 			return raid0_takeover_raid5(mddev);
 
-		printk(KERN_ERR "md: Raid can only takeover Raid5 with layout: %d\n",
-		       ALGORITHM_PARITY_N);
+		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
+		       mdname(mddev), ALGORITHM_PARITY_N);
 	}
 
 	if (mddev->level == 10)
-- 
cgit v0.10.2


From 2dc40f80945ac3e5ec05c3a6c75baf09b13cee51 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 May 2010 15:12:04 +1000
Subject: md/linear: standardise all printk messages

  md/linear:mdname:

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3204a22..d5d5064 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -158,7 +158,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		sector_t sectors;
 
 		if (j < 0 || j >= raid_disks || disk->rdev) {
-			printk("linear: disk numbering problem. Aborting!\n");
+			printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
+			       mdname(mddev));
 			goto out;
 		}
 
@@ -186,7 +187,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 
 	}
 	if (cnt != raid_disks) {
-		printk("linear: not enough drives present. Aborting!\n");
+		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
+		       mdname(mddev));
 		goto out;
 	}
 
@@ -305,12 +307,14 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 		     || (bio->bi_sector < start_sector))) {
 		char b[BDEVNAME_SIZE];
 
-		printk("linear_make_request: Sector %llu out of bounds on "
-			"dev %s: %llu sectors, offset %llu\n",
-			(unsigned long long)bio->bi_sector,
-			bdevname(tmp_dev->rdev->bdev, b),
-			(unsigned long long)tmp_dev->rdev->sectors,
-			(unsigned long long)start_sector);
+		printk(KERN_ERR
+		       "md/linear:%s: make_request: Sector %llu out of bounds on "
+		       "dev %s: %llu sectors, offset %llu\n",
+		       mdname(mddev),
+		       (unsigned long long)bio->bi_sector,
+		       bdevname(tmp_dev->rdev->bdev, b),
+		       (unsigned long long)tmp_dev->rdev->sectors,
+		       (unsigned long long)start_sector);
 		rcu_read_unlock();
 		bio_io_error(bio);
 		return 0;
-- 
cgit v0.10.2


From af3a2cd6b8a479345786e7fe5e199ad2f6240e56 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sat, 8 May 2010 08:20:17 +1000
Subject: md: Fix read balancing in RAID1 and RAID10 on drives > 2TB

read_balance uses a "unsigned long" for a sector number which
will get truncated beyond 2TB.
This will cause read-balancing to be non-optimal, and can cause
data to be read from the 'wrong' branch during a resync.  This has a
very small chance of returning wrong data.

Reported-by: Jordan Russell <jr-list-2010@quo.to>
Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1db02c4..a9b9972 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -418,7 +418,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
  */
 static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
-	const unsigned long this_sector = r1_bio->sector;
+	const sector_t this_sector = r1_bio->sector;
 	int new_disk = conf->last_used, disk = new_disk;
 	int wonly_disk = -1;
 	const int sectors = r1_bio->sectors;
@@ -434,7 +434,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
  retry:
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync)) {
-		/* Choose the first operation device, for consistancy */
+		/* Choose the first operational device, for consistancy */
 		new_disk = 0;
 
 		for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e0742c4..a1f5fd2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -495,7 +495,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  */
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
-	const unsigned long this_sector = r10_bio->sector;
+	const sector_t this_sector = r10_bio->sector;
 	int disk, slot, nslot;
 	const int sectors = r10_bio->sectors;
 	sector_t new_distance, current_distance;
-- 
cgit v0.10.2


From 75a73a29e520a6ce982b0da6dd8b7560ae3faa90 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 7 May 2010 19:44:26 +1000
Subject: md: restore ability of spare drives to spin down.

Some time ago we stopped the clean/active metadata updates
from being written to a 'spare' device in most cases so that
it could spin down and say spun down.  Device failure/removal
etc are still recorded on spares.

However commit 51d5668cb2e3fd1827a55 broke this 50% of the time,
depending on whether the event count is even or odd.
The change log entry said:

   This means that the alignment between 'odd/even' and
    'clean/dirty' might take a little longer to attain,

how ever the code makes no attempt to create that alignment, so it
could take arbitrarily long.

So when we find that clean/dirty is not aligned with odd/even,
force a second metadata-update immediately.  There are already cases
where a second metadata-update is needed immediately (e.g. when a
device fails during the metadata update).  We just piggy-back on that.

Reported-by: Joe Bryant <tenminjoe@yahoo.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Cc: stable@kernel.org

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b8a0fcf..fec4abc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2150,12 +2150,18 @@ repeat:
 		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
 			/* .. if the array isn't clean, an 'even' event must also go
 			 * to spares. */
-			if ((mddev->events&1)==0)
+			if ((mddev->events&1)==0) {
 				nospares = 0;
+				sync_req = 2; /* force a second update to get the
+					       * even/odd in sync */
+			}
 		} else {
 			/* otherwise an 'odd' event must go to spares */
-			if ((mddev->events&1))
+			if ((mddev->events&1)) {
 				nospares = 0;
+				sync_req = 2; /* force a second update to get the
+					       * even/odd in sync */
+			}
 		}
 	}
 
-- 
cgit v0.10.2


From 7b0bb5368a7195606eca475d9f4e291ab7227052 Mon Sep 17 00:00:00 2001
From: "Gabriele A. Trombetti" <g.trombetti.lkrnl1213@logicschema.com>
Date: Wed, 28 Apr 2010 11:51:17 +1000
Subject: md/raid6: Fix raid-6 read-error correction in degraded state

Fix: Raid-6 was not trying to correct a read-error when in
singly-degraded state and was instead dropping one more device, going to
doubly-degraded state. This patch fixes this behaviour.

Tested-by: Janos Haar <janos.haar@netcenter.hu>
Signed-off-by: Gabriele A. Trombetti <g.trombetti.lkrnl1213@logicschema.com>
Reported-by: Janos Haar <janos.haar@netcenter.hu>
Signed-off-by: NeilBrown <neilb@suse.de>
Cc: stable@kernel.org

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cee9f93..eacf02a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1527,7 +1527,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
-		if (conf->mddev->degraded)
+		if (conf->mddev->degraded >= conf->max_degraded)
 			printk_rl(KERN_WARNING
 				  "md/raid:%s: read error not correctable "
 				  "(sector %llu on %s).\n",
-- 
cgit v0.10.2


From a8707c08f4f718bb0ed65499d3f43201f6e41455 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 18 May 2010 09:28:43 +1000
Subject: md: simplify updating of event count to sometimes avoid updating
 spares.

When updating the event count for a simple clean <-> dirty transition,
we try to avoid updating the spares so they can safely spin-down.
As the event_counts across an array must be +/- 1, this means
decrementing the event_count on a dirty->clean transition.
This is not always safe and we have to avoid the unsafe time.
We current do this with a misguided idea about it being safe or
not depending on whether the event_count is odd or even.  This
approach only works reliably in a few common instances, but easily
falls down.

So instead, simply keep internal state concerning whether it is safe
or not, and always assume it is not safe when an array is first
assembled.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index fec4abc..9ef21d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2088,7 +2088,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 		if (rdev->sb_events == mddev->events ||
 		    (nospares &&
 		     rdev->raid_disk < 0 &&
-		     (rdev->sb_events&1)==0 &&
 		     rdev->sb_events+1 == mddev->events)) {
 			/* Don't update this superblock */
 			rdev->sb_loaded = 2;
@@ -2141,28 +2140,14 @@ repeat:
 	 * and 'events' is odd, we can roll back to the previous clean state */
 	if (nospares
 	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
-	    && (mddev->events & 1)
-	    && mddev->events != 1)
+	    && mddev->can_decrease_events
+	    && mddev->events != 1) {
 		mddev->events--;
-	else {
+		mddev->can_decrease_events = 0;
+	} else {
 		/* otherwise we have to go forward and ... */
 		mddev->events ++;
-		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
-			/* .. if the array isn't clean, an 'even' event must also go
-			 * to spares. */
-			if ((mddev->events&1)==0) {
-				nospares = 0;
-				sync_req = 2; /* force a second update to get the
-					       * even/odd in sync */
-			}
-		} else {
-			/* otherwise an 'odd' event must go to spares */
-			if ((mddev->events&1)) {
-				nospares = 0;
-				sync_req = 2; /* force a second update to get the
-					       * even/odd in sync */
-			}
-		}
+		mddev->can_decrease_events = nospares;
 	}
 
 	if (!mddev->events) {
@@ -4606,6 +4591,7 @@ static void md_clean(mddev_t *mddev)
 	mddev->layout = 0;
 	mddev->max_disks = 0;
 	mddev->events = 0;
+	mddev->can_decrease_events = 0;
 	mddev->delta_disks = 0;
 	mddev->new_level = LEVEL_NONE;
 	mddev->new_layout = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a536f54..7ab5ea1 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -150,6 +150,12 @@ struct mddev_s
 	int				external_size; /* size managed
 							* externally */
 	__u64				events;
+	/* If the last 'event' was simply a clean->dirty transition, and
+	 * we didn't write it to the spares, then it is safe and simple
+	 * to just decrement the event count on a dirty->clean transition.
+	 * So we record that possibility here.
+	 */
+	int				can_decrease_events;
 
 	char				uuid[16];
 
-- 
cgit v0.10.2


From be6800a73aa2f3dc14744c3b80e676d189789f04 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 18 May 2010 10:17:09 +1000
Subject: md: don't insist on valid event count for spare devices.

Devices which know that they are spares do not really need to have
an event count that matches the rest of the array, so there are no
data-in-sync issues. It is enough that the uuid matches.
So remove the requirement that the event count is up-to-date.

We currently still write out and event count on spares, but this
allows us in a year or 3 to stop doing that completely.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9ef21d9..26b3d28 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1070,10 +1070,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 				mddev->bitmap_info.default_offset;
 
 	} else if (mddev->pers == NULL) {
-		/* Insist on good event counter while assembling */
+		/* Insist on good event counter while assembling, except
+		 * for spares (which don't need an event count) */
 		++ev1;
-		if (ev1 < mddev->events) 
-			return -EINVAL;
+		if (sb->disks[rdev->desc_nr].state & (
+			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
+			if (ev1 < mddev->events) 
+				return -EINVAL;
 	} else if (mddev->bitmap) {
 		/* if adding to array with a bitmap, then we can accept an
 		 * older device ... but not too old.
@@ -1469,10 +1472,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		}
 
 	} else if (mddev->pers == NULL) {
-		/* Insist of good event counter while assembling */
+		/* Insist of good event counter while assembling, except for
+		 * spares (which don't need an event count) */
 		++ev1;
-		if (ev1 < mddev->events)
-			return -EINVAL;
+		if (rdev->desc_nr >= 0 &&
+		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+		    le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
+			if (ev1 < mddev->events)
+				return -EINVAL;
 	} else if (mddev->bitmap) {
 		/* If adding to array with a bitmap, then we can accept an
 		 * older device, but not too old.
-- 
cgit v0.10.2