From d87f064f5874ba60814c6ccac67104761be5b26c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 24 Apr 2013 11:42:40 +1000 Subject: md: never update metadata when array is read-only. Normally we don't even try to update the metadata if the array is read-only. However future patches will increase the number of things that can happen on a read-only array, so it is safest to explicitly disable this. Every time that mddev->ro is set to 0, either - md_update_sb will be called again (at least if MD_CHANGE_DEVS is set) or - the mddev->thread is scheduled, which will also run md_update_sb if needed. So this is safe: if the array ever become read-write the metadata will be updated. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index aeceedf..c6e44a8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2411,6 +2411,11 @@ static void md_update_sb(struct mddev * mddev, int force_change) int nospares = 0; int any_badblocks_changed = 0; + if (mddev->ro) { + if (force_change) + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return; + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { -- cgit v0.10.2 From 746d3207ae2f9cf71fd87c9df884aa104fdaafa6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 24 Apr 2013 11:42:41 +1000 Subject: md: use common code for all calls to ->hot_remove_disk() slot_store and remove_and_add_spares both call ->hot_remove_disk(), but with slightly different tests and consequences, which is at least untidy and might be buggy. So modify remove_and_add_spaces() so that it can be asked to remove a specific device, and call it from slot_store(). We also clear the Blocked flag to ensure that doesn't prevent removal. The purpose of Blocked is to prevent automatic removal by the kernel before an error is acknowledged. If the array is read/write then user-space would have not reason to remove a device unless it was known to be 'spare' or 'faulty' in which it would have already cleared the Blocked flag. If the array is read-only, the flag might still be blocked, but there is no harm in clearing the flag for read-only arrays. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index c6e44a8..07f207e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -72,6 +72,9 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* @@ -2805,12 +2808,10 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) /* personality does all needed checks */ if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; - err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev); - if (err) - return err; - sysfs_unlink_rdev(rdev->mddev, rdev); - rdev->raid_disk = -1; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk >= 0) + return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { @@ -7649,14 +7650,16 @@ void md_do_sync(struct md_thread *thread) } EXPORT_SYMBOL_GPL(md_do_sync); -static int remove_and_add_spares(struct mddev *mddev) +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) { struct md_rdev *rdev; int spares = 0; int removed = 0; rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && @@ -7671,6 +7674,9 @@ static int remove_and_add_spares(struct mddev *mddev) if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); + if (this) + goto no_add; + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && @@ -7689,6 +7695,7 @@ static int remove_and_add_spares(struct mddev *mddev) } } } +no_add: if (removed) set_bit(MD_CHANGE_DEVS, &mddev->flags); return spares; @@ -7872,7 +7879,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev))) { + } else if ((spares = remove_and_add_spares(mddev, NULL))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); -- cgit v0.10.2 From 3ea8929da3e61b54d4adcab1bf9a0034648d578e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 24 Apr 2013 11:42:41 +1000 Subject: md: HOT_DISK_REMOVE shouldn't make a read-auto device active. If a fail device or a spare is removed from an array, there is not need to make the array 'active'. If/when the array does become active for some other reason the metadata will be update to reflect the removal. If that never happens and the array is stopped while still read-auto, then there is no loss in forgetting the that the device had 'failed'. A read-only array will leave failed devices attached to the array personality, so we need to explicitly call remove_and_add_spares() to free it (clearing Blocked just like we do in store_slot()). Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 07f207e..802c2a3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5883,6 +5883,9 @@ static int hot_remove_disk(struct mddev * mddev, dev_t dev) if (!rdev) return -ENXIO; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(mddev, rdev); + if (rdev->raid_disk >= 0) goto busy; @@ -6496,6 +6499,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = md_set_readonly(mddev, bdev); goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; + case BLKROSET: if (get_user(ro, (int __user *)(arg))) { err = -EFAULT; @@ -6566,10 +6573,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto done_unlock; } - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; -- cgit v0.10.2 From 3f810b6c4a850db1b9989cd569ab88b90cd24996 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 24 Apr 2013 11:42:41 +1000 Subject: md: use set_bit_le and clear_bit_le The value returned by test_and_set_bit_le() drivers/md/bitmap.c is not used. So just use set_bit_le(). The same goes for test_and_clear_bit_le(). Signed-off-by: Akinobu Mita Cc: Neil Brown Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 4fd9d6a..5a2c754 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -846,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else - test_and_set_bit_le(bit, kaddr); + set_bit_le(bit, kaddr); kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ @@ -868,7 +868,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) clear_bit(bit, paddr); else - test_and_clear_bit_le(bit, paddr); + clear_bit_le(bit, paddr); kunmap_atomic(paddr); if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); -- cgit v0.10.2 From 6f608040ce6cf37475dd2f6f3c97b009dab7c1d1 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 24 Apr 2013 11:42:41 +1000 Subject: md/raid5: Change or of some order to improve efficiency. As the function call is the most expensive of these tests it should be done later in the chain so that it can be avoided in some cases. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f4e87bf..f31882c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4672,9 +4672,10 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped = 1; return rv; } - if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && - !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !conf->fullsync && + !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + sync_blocks >= STRIPE_SECTORS) { /* we can skip this block, and probably more */ sync_blocks /= STRIPE_SECTORS; *skipped = 1; -- cgit v0.10.2 From c0b32972fb1e1110101702f096c0877afcab9f1d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 24 Apr 2013 11:42:42 +1000 Subject: md/raid5: avoid an extra write when writing to a known-bad-block. If we write to a known-bad-block it will be flags as having a ReadError by analyse_stripe, but the write will proceed anyway (as it should). Then the read-error handling will kick in an write again, then re-read. We don't need that 'write-again', so set R5_ReWrite so it looks like it has already been done. Then we will just get the re-read, which we want. Reported-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f31882c..4a7be45 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1887,8 +1887,15 @@ static void raid5_end_write_request(struct bio *bi, int error) &rdev->mddev->recovery); } else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors)) + &first_bad, &bad_sectors)) { set_bit(R5_MadeGood, &sh->dev[i].flags); + if (test_bit(R5_ReadError, &sh->dev[i].flags)) + /* That was a successful write so make + * sure it looks like we already did + * a re-write. + */ + set_bit(R5_ReWrite, &sh->dev[i].flags); + } } rdev_dec_pending(rdev, conf->mddev); -- cgit v0.10.2 From b6fec069af5a26439f7d36cb3d4f5dd1378af7e2 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Wed, 24 Apr 2013 11:42:42 +1000 Subject: MD: Fix typos in MD documentation MD: Fix some typos/grammer in MD documentation Reviewed-by: Paul Menzel Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown diff --git a/Documentation/md.txt b/Documentation/md.txt index 993fba3..e0ddd32 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -119,7 +119,7 @@ device to add. The array is started with the RUN_ARRAY ioctl. Once started, new devices can be added. They should have an -appropriate superblock written to them, and then passed be in with +appropriate superblock written to them, and then be passed in with ADD_NEW_DISK. Devices that have failed or are not yet active can be detached from an @@ -131,7 +131,7 @@ Specific Rules that apply to format-0 super block arrays, and ------------------------------------------------------------- An array can be 'created' by describing the array (level, chunksize -etc) in a SET_ARRAY_INFO ioctl. This must has major_version==0 and +etc) in a SET_ARRAY_INFO ioctl. This must have major_version==0 and raid_disks != 0. Then uninitialized devices can be added with ADD_NEW_DISK. The @@ -426,7 +426,7 @@ Each directory contains: offset This gives the location in the device (in sectors from the start) where data from the array will be stored. Any part of - the device before this offset us not touched, unless it is + the device before this offset is not touched, unless it is used for storing metadata (Formats 1.1 and 1.2). size @@ -440,7 +440,7 @@ Each directory contains: When the device is not 'in_sync', this records the number of sectors from the start of the device which are known to be correct. This is normally zero, but during a recovery - operation is will steadily increase, and if the recovery is + operation it will steadily increase, and if the recovery is interrupted, restoring this value can cause recovery to avoid repeating the earlier blocks. With v1.x metadata, this value is saved and restored automatically. @@ -468,7 +468,7 @@ Each directory contains: -An active md device will also contain and entry for each active device +An active md device will also contain an entry for each active device in the array. These are named rdNN @@ -482,7 +482,7 @@ will show 'in_sync' on every line. -Active md devices for levels that support data redundancy (1,4,5,6) +Active md devices for levels that support data redundancy (1,4,5,6,10) also have sync_action @@ -494,7 +494,7 @@ also have failed/missing device idle - nothing is happening check - A full check of redundancy was requested and is - happening. This reads all block and checks + happening. This reads all blocks and checks them. A repair may also happen for some raid levels. repair - A full check and repair is happening. This is @@ -522,7 +522,7 @@ also have degraded This contains a count of the number of devices by which the - arrays is degraded. So an optimal array with show '0'. A + arrays is degraded. So an optimal array will show '0'. A single failed/missing drive will show '1', etc. This file responds to select/poll, any increase or decrease in the count of missing devices will trigger an event. -- cgit v0.10.2 From 7e83ccbecd608b971f340e951c9e84cd0343002f Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 24 Apr 2013 11:42:42 +1000 Subject: md/raid10: Allow skipping recovery when clean arrays are assembled When an array is assembled incrementally with mdadm -I -R and the array switches to "active" mode, md starts a recovery. If the array was clean, the "fullsync" flag will be 0. Skip the full recovery in this case, as RAID1 does (the code was actually copied from the sync_request() method of RAID1). Signed-off-by: Martin Wilck Signed-off-by: NeilBrown diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 77b562d..2372926 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2913,6 +2913,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; + /* + * Allow skipping a full rebuild for incremental assembly + * of a clean array, like RAID1 does. + */ + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + max_sector = mddev->dev_sectors; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sector = mddev->resync_max_sectors; + return max_sector - sector_nr; + } + skipped: max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || -- cgit v0.10.2 From 7ceb17e87bde79d285a8b988cfed9eaeebe60b86 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 24 Apr 2013 11:42:42 +1000 Subject: md: Allow devices to be re-added to a read-only array. When assembling an array incrementally we might want to make it device available when "enough" devices are present, but maybe not "all" devices are present. If the remaining devices appear before the array is actually used, they should be added transparently. We do this by using the "read-auto" mode where the array acts like it is read-only until a write request arrives. Current an add-device request switches a read-auto array to active. This means that only one device can be added after the array is first made read-auto. This isn't a problem for RAID5, but is not ideal for RAID6 or RAID10. Also we don't really want to switch the array to read-auto at all when re-adding a device as this doesn't really imply any change. So: - remove the "md_update_sb()" call from add_new_disk(). This isn't really needed as just adding a disk doesn't require a metadata update. Instead, just set MD_CHANGE_DEVS. This will effect a metadata update soon enough, once the array is not read-only. - Allow the ADD_NEW_DISK ioctl to succeed without activating a read-auto array, providing the MD_DISK_SYNC flag is set. In this case, the device will be rejected if it cannot be added with the correct device number, or has an incorrect event count. - Teach remove_and_add_spares() to be careful about adding spares when the array is read-only (or read-mostly) - only add devices that are thought to be in-sync, and only do it if the array is in-sync itself. - In md_check_recovery, use remove_and_add_spares in the read-only case, rather than open coding just the 'remove' part of it. Reported-by: Martin Wilck Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 802c2a3..491afda 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5816,7 +5816,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) else sysfs_notify_dirent_safe(rdev->sysfs_state); - md_update_sb(mddev, 1); + set_bit(MD_CHANGE_DEVS, &mddev->flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -6503,6 +6503,24 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = hot_remove_disk(mddev, new_decode_dev(arg)); goto done_unlock; + case ADD_NEW_DISK: + /* We can support ADD_NEW_DISK on read-only arrays + * on if we are re-adding a preexisting device. + * So require mddev->pers and MD_DISK_SYNC. + */ + if (mddev->pers) { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else if (!(info.state & (1<flags) && !test_bit(Faulty, &rdev->flags)) spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } + if (rdev->raid_disk >= 0) + continue; + if (test_bit(Faulty, &rdev->flags)) + continue; + if (mddev->ro && + rdev->saved_raid_disk < 0) + continue; + + rdev->recovery_offset = 0; + if (rdev->saved_raid_disk >= 0 && mddev->in_sync) { + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync) + /* OK, this device, which is in_sync, + * will definitely be noticed before + * the next write, so recovery isn't + * needed. + */ + rdev->recovery_offset = mddev->recovery_cp; + spin_unlock_irq(&mddev->write_lock); + } + if (mddev->ro && rdev->recovery_offset != MaxSector) + /* not safe to add this disk now */ + continue; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + spares++; + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } } no_add: @@ -7804,22 +7841,16 @@ void md_check_recovery(struct mddev *mddev) int spares = 0; if (mddev->ro) { - /* Only thing we do on a ro array is remove - * failed devices. + /* On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself + * is in-sync. + * As we only add devices that are already in-sync, + * we can activate the spares immediately. */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - } - } clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + remove_and_add_spares(mddev, NULL); + mddev->pers->spare_active(mddev); goto unlock; } -- cgit v0.10.2 From b6d428c669159b8cfa5a83bb11abaaf4bdc170b0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 24 Apr 2013 11:42:42 +1000 Subject: md: don't update metadata when stopping a read-only array. read-only arrays should stay that way as much as possible. Updating the metadata - which could be triggered by a re-add while assembling the array metadata - should be avoided. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 491afda..3013228 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5293,7 +5293,8 @@ static void __md_stop_writes(struct mddev *mddev) bitmap_flush(mddev); md_super_wait(mddev); - if (!mddev->in_sync || mddev->flags) { + if (mddev->ro == 0 && + (!mddev->in_sync || mddev->flags)) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; md_update_sb(mddev, 1); -- cgit v0.10.2 From a91d5ac04841ca1be340e8610e6d899fc8b419b5 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Wed, 24 Apr 2013 11:42:43 +1000 Subject: MD: Export 'md_reap_sync_thread' function MD: Export 'md_reap_sync_thread' function Make 'md_reap_sync_thread' available to other files, specifically dm-raid.c. - rename reap_sync_thread to md_reap_sync_thread - move the fn after md_check_recovery to match md.h declaration placement - export md_reap_sync_thread Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 3013228..0df1b9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4231,8 +4231,6 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } -static void reap_sync_thread(struct mddev *mddev); - static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -4247,7 +4245,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -5285,7 +5283,7 @@ static void __md_stop_writes(struct mddev *mddev) if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -7742,51 +7740,6 @@ no_add: return spares; } -static void reap_sync_thread(struct mddev *mddev) -{ - struct md_rdev *rdev; - - /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - - /* If array is no-longer degraded, then any saved_raid_disk - * information must be scrapped. Also if any device is now - * In_sync we must scrape the saved_raid_disk for that device - * do the superblock for an incrementally recovered device - * written out. - */ - rdev_for_each(rdev, mddev) - if (!mddev->degraded || - test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = -1; - - md_update_sb(mddev, 1); - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); - if (mddev->event_work.func) - queue_work(md_misc_wq, &mddev->event_work); -} - /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7883,7 +7836,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7964,6 +7917,51 @@ void md_check_recovery(struct mddev *mddev) } } +void md_reap_sync_thread(struct mddev *mddev) +{ + struct md_rdev *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) { + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. Also if any device is now + * In_sync we must scrape the saved_raid_disk for that device + * do the superblock for an incrementally recovered device + * written out. + */ + rdev_for_each(rdev, mddev) + if (!mddev->degraded || + test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} + void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -8689,6 +8687,7 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); +EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); diff --git a/drivers/md/md.h b/drivers/md/md.h index d90fb1a..653f992b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -567,6 +567,7 @@ extern struct md_thread *md_register_thread( extern void md_unregister_thread(struct md_thread **threadp); extern void md_wakeup_thread(struct md_thread *thread); extern void md_check_recovery(struct mddev *mddev); +extern void md_reap_sync_thread(struct mddev *mddev); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); -- cgit v0.10.2 From be83651f0050ca8621d58d35dad558e9c45cb18f Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Wed, 24 Apr 2013 11:42:43 +1000 Subject: DM RAID: Add message/status support for changing sync action DM RAID: Add message/status support for changing sync action This patch adds a message interface to dm-raid to allow the user to more finely control the sync actions being performed by the MD driver. This gives the user the ability to initiate "check" and "repair" (i.e. scrubbing). Two additional fields have been appended to the status output to provide more information about the type of sync action occurring and the results of those actions, specifically: and . These new fields will always be populated. This is essentially the device-mapper way of doing what MD controls through the 'sync_action' sysfs file and shows through the 'mismatch_cnt' sysfs file. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index b428556..e919228 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -1,10 +1,13 @@ dm-raid -------- +======= The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. It allows the MD RAID drivers to be accessed using a device-mapper interface. + +Mapping Table Interface +----------------------- The target is named "raid" and it accepts the following parameters: <#raid_params> \ @@ -47,7 +50,7 @@ The target is named "raid" and it accepts the following parameters: followed by optional parameters (in any order): [sync|nosync] Force or prevent RAID initialization. - [rebuild ] Rebuild drive number idx (first drive is 0). + [rebuild ] Rebuild drive number 'idx' (first drive is 0). [daemon_sleep ] Interval between runs of the bitmap daemon that @@ -56,9 +59,9 @@ The target is named "raid" and it accepts the following parameters: [min_recovery_rate ] Throttle RAID initialization [max_recovery_rate ] Throttle RAID initialization - [write_mostly ] Drive index is write-mostly - [max_write_behind ] See '-write-behind=' (man mdadm) - [stripe_cache ] Stripe cache size (higher RAIDs only) + [write_mostly ] Mark drive index 'idx' write-mostly. + [max_write_behind ] See '--write-behind=' (man mdadm) + [stripe_cache ] Stripe cache size (RAID 4/5/6 only) [region_size ] The region_size multiplied by the number of regions is the logical size of the array. The bitmap records the device @@ -122,7 +125,7 @@ The target is named "raid" and it accepts the following parameters: given for both the metadata and data drives for a given position. -Example tables +Example Tables -------------- # RAID4 - 4 data drives, 1 parity (no metadata devices) # No metadata devices specified to hold superblock/bitmap info @@ -141,26 +144,70 @@ Example tables raid4 4 2048 sync min_recovery_rate 20 \ 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 + +Status Output +------------- 'dmsetup table' displays the table used to construct the mapping. The optional parameters are always printed in the order listed above with "sync" or "nosync" always output ahead of the other arguments, regardless of the order used when originally loading the table. Arguments that can be repeated are ordered by value. -'dmsetup status' yields information on the state and health of the -array. -The output is as follows: + +'dmsetup status' yields information on the state and health of the array. +The output is as follows (normally a single line, but expanded here for +clarity): 1: raid \ -2: <#devices> <1 health char for each dev> +2: <#devices> \ +3: Line 1 is the standard output produced by device-mapper. -Line 2 is produced by the raid target, and best explained by example: - 0 1960893648 raid raid4 5 AAAAA 2/490221568 +Line 2 & 3 are produced by the raid target and are best explained by example: + 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 Here we can see the RAID type is raid4, there are 5 devices - all of -which are 'A'live, and the array is 2/490221568 complete with recovery. -Faulty or missing devices are marked 'D'. Devices that are out-of-sync -are marked 'a'. - +which are 'A'live, and the array is 2/490221568 complete with its initial +recovery. Here is a fuller description of the individual fields: + Same as the used to create the array. + One char for each device, indicating: 'A' = alive and + in-sync, 'a' = alive but not in-sync, 'D' = dead/failed. + The ratio indicating how much of the array has undergone + the process described by 'sync_action'. If the + 'sync_action' is "check" or "repair", then the process + of "resync" or "recover" can be considered complete. + One of the following possible states: + idle - No synchronization action is being performed. + frozen - The current action has been halted. + resync - Array is undergoing its initial synchronization + or is resynchronizing after an unclean shutdown + (possibly aided by a bitmap). + recover - A device in the array is being rebuilt or + replaced. + check - A user-initiated full check of the array is + being performed. All blocks are read and + checked for consistency. The number of + discrepancies found are recorded in + . No changes are made to the + array by this action. + repair - The same as "check", but discrepancies are + corrected. + reshape - The array is undergoing a reshape. + The number of discrepancies found between mirror copies + in RAID1/10 or wrong parity values found in RAID4/5/6. + This value is valid only after a "check" of the array + is performed. A healthy array has a 'mismatch_cnt' of 0. + +Message Interface +----------------- +The dm-raid target will accept certain actions through the 'message' interface. +('man dmsetup' for more information on the message interface.) These actions +include: + "idle" - Halt the current sync action. + "frozen" - Freeze the current sync action. + "resync" - Initiate/continue a resync. + "recover"- Initiate/continue a recover process. + "check" - Initiate a check (i.e. a "scrub") of the array. + "repair" - Initiate a repair of the array. + "reshape"- Currently unsupported (-EINVAL). Version History --------------- @@ -171,4 +218,7 @@ Version History 1.3.1 Allow device replacement/rebuild for RAID 10 1.3.2 Fix/improve redundancy checking for RAID10 1.4.0 Non-functional change. Removes arg from mapping function. -1.4.1 Add RAID10 "far" and "offset" algorithm support. +1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). +1.4.2 Add RAID10 "far" and "offset" algorithm support. +1.5.0 Add message interface to allow manipulation of the sync_action. + New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 311e3d3..1d3fe1a 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1279,6 +1279,31 @@ static int raid_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static const char *decipher_sync_action(struct mddev *mddev) +{ + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return "frozen"; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return "reshape"; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return "resync"; + else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + return "check"; + return "repair"; + } + + if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + return "recover"; + } + + return "idle"; +} + static void raid_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -1298,8 +1323,18 @@ static void raid_status(struct dm_target *ti, status_type_t type, sync = rs->md.recovery_cp; if (sync >= rs->md.resync_max_sectors) { + /* + * Sync complete. + */ array_in_sync = 1; sync = rs->md.resync_max_sectors; + } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) { + /* + * If "check" or "repair" is occurring, the array has + * undergone and initial sync and the health characters + * should not be 'a' anymore. + */ + array_in_sync = 1; } else { /* * The array may be doing an initial sync, or it may @@ -1311,6 +1346,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) array_in_sync = 1; } + /* * Status characters: * 'D' = Dead/Failed device @@ -1339,6 +1375,21 @@ static void raid_status(struct dm_target *ti, status_type_t type, (unsigned long long) sync, (unsigned long long) rs->md.resync_max_sectors); + /* + * Sync action: + * See Documentation/device-mapper/dm-raid.c for + * information on each of these states. + */ + DMEMIT(" %s", decipher_sync_action(&rs->md)); + + /* + * resync_mismatches/mismatch_cnt + * This field shows the number of discrepancies found when + * performing a "check" of the array. + */ + DMEMIT(" %llu", + (unsigned long long) + atomic64_read(&rs->md.resync_mismatches)); break; case STATUSTYPE_TABLE: /* The string you would use to construct this array */ @@ -1425,7 +1476,62 @@ static void raid_status(struct dm_target *ti, status_type_t type, } } -static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +static int raid_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + if (!strcasecmp(argv[0], "reshape")) { + DMERR("Reshape not supported."); + return -EINVAL; + } + + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + if (!strcasecmp(argv[0], "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return -EBUSY; + else if (!strcasecmp(argv[0], "resync")) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + else if (!strcasecmp(argv[0], "recover")) { + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } else { + if (!strcasecmp(argv[0], "check")) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (!!strcasecmp(argv[0], "repair")) + return -EINVAL; + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + if (!mddev->suspended) + md_wakeup_thread(mddev->sync_thread); + } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!mddev->suspended) + md_wakeup_thread(mddev->thread); + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) { struct raid_set *rs = ti->private; unsigned i; @@ -1482,12 +1588,13 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 4, 2}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, .map = raid_map, .status = raid_status, + .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, .presuspend = raid_presuspend, -- cgit v0.10.2 From 0fea7ed82b1edf384735a2b535368f54a8584c3a Mon Sep 17 00:00:00 2001 From: Hirokazu Takahashi Date: Wed, 24 Apr 2013 11:42:44 +1000 Subject: md: raid1/raid10 md devices leak memory when stopping Hi. Raid1 and raid10 devices leak memory every time they stop. This is a patch for linux-3.9.0-rc7 to fix this problem. Thanks, Hirokazu Takahashi. Signed-off-by: Hirokazu Takahashi Signed-off-by: NeilBrown diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fd86b37..c055b92 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2901,6 +2901,7 @@ static int stop(struct mddev *mddev) if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); + safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); mddev->private = NULL; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2372926..c35d912 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3826,6 +3826,7 @@ static int stop(struct mddev *mddev) if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); + safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); mddev->private = NULL; -- cgit v0.10.2 From 486adf72ccc0c235754923d47a2270c5dcb0c98b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 24 Apr 2013 11:42:44 +1000 Subject: md: bad block list should default to disabled. Maintenance of a bad-block-list currently defaults to 'enabled' and is then disabled when it cannot be supported. This is backwards and causes problem for dm-raid which didn't know to disable it. So fix the defaults, and only enabled for v1.x metadata which explicitly has bad blocks enabled. The problem with dm-raid has been present since badblock support was added in v3.1, so this patch is suitable for any -stable from 3.1 onwards. Cc: stable@vger.kernel.org (3.1+) Reported-by: Jonathan Brassow Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 0df1b9a..4c74424 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1567,8 +1567,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector, count, 1) == 0) return -EINVAL; } - } else if (sb->bblog_offset == 0) - rdev->badblocks.shift = -1; + } else if (sb->bblog_offset != 0) + rdev->badblocks.shift = 0; if (!refdev) { ret = 1; @@ -3227,7 +3227,7 @@ int md_rdev_init(struct md_rdev *rdev) * be used - I wonder if that matters */ rdev->badblocks.count = 0; - rdev->badblocks.shift = 0; + rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); seqlock_init(&rdev->badblocks.lock); if (rdev->badblocks.page == NULL) @@ -3299,9 +3299,6 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe goto abort_free; } } - if (super_format == -1) - /* hot-add for 0.90, or non-persistent: so no badblocks */ - rdev->badblocks.shift = -1; return rdev; -- cgit v0.10.2 From 32f9f570d04461a41bdcd5c1d93b41ebc5ce182a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 28 Apr 2013 18:26:38 +0800 Subject: MD: ignore discard request for hard disks of hybid raid1/raid10 array In SSD/hard disk hybid storage, discard request should be ignored for hard disk. We used to be doing this way, but the unplug path forgets it. This is suitable for stable tree since v3.6. Cc: stable@vger.kernel.org Reported-and-tested-by: Markus Signed-off-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c055b92..851023e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -981,7 +981,12 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } kfree(plug); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c35d912..018741b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1133,7 +1133,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } kfree(plug); -- cgit v0.10.2