From bf572541ab44240163eaa2d486b06f306a31d45a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 12 Jan 2011 09:03:35 +1100 Subject: md: fix regression with re-adding devices to arrays with no metadata Commit 1a855a0606 (2.6.37-rc4) fixed a problem where devices were re-added when they shouldn't be but caused a regression in a less common case that means sometimes devices cannot be re-added when they should be. In particular, when re-adding a device to an array without metadata we should always access the device, but after the above commit we didn't. This patch sets the In_sync flag in that case so that the re-add succeeds. This patch is suitable for any -stable kernel to which 1a855a0606 was applied. Cc: stable@kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 175c424f..0da25da 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5159,9 +5159,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<raid_disk < mddev->raid_disks) + info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - else + set_bit(In_sync, &rdev->flags); + } else rdev->raid_disk = -1; } else super_types[mddev->major_version]. -- cgit v0.10.2 From 6c9879101442b08581e8a0e3ae6b7f643a78fd63 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:13:53 +1100 Subject: md: fix regression resulting in delays in clearing bits in a bitmap commit 589a594be1fb (2.6.37-rc4) fixed a problem were md_thread would sometimes call the ->run function at a bad time. If an error is detected during array start up after the md_thread has been started, the md_thread is killed. This resulted in the ->run function being called once. However the array may not be in a state that it is safe to call ->run. However the fix imposed meant that ->run was not called on a timeout. This means that when an array goes idle, bitmap bits do not get cleared promptly. While the array is busy the bits will still be cleared when appropriate so this is not very serious. There is no risk to data. Change the test so that we only avoid calling ->run when the thread is being stopped. This more explicitly addresses the problem situation. This is suitable for 2.6.37-stable and any -stable kernel to which 589a594be1fb was applied. Cc: stable@kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 0da25da..a301912 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6042,7 +6042,8 @@ static int md_thread(void * arg) || kthread_should_stop(), thread->timeout); - if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) + clear_bit(THREAD_WAKEUP, &thread->flags); + if (!kthread_should_stop()) thread->run(thread->mddev); } -- cgit v0.10.2 From 067032bc628598606056412594042564fcf09e22 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: Fix single printks with multiple KERN_s Noticed-by: Russell King Signed-off-by: Joe Perches Signed-off-by: NeilBrown diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 845cf95..d50056b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0641674..03825cb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dc574f3..316fbe7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), -- cgit v0.10.2 From 0ca69886a8273ac1350143d562280bfcbe4760dc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: Ensure no IO request to get md device before it is properly initialised. When an md device is in the process of coming on line it is possible for an IO request (typically a partition table probe) to get through before the array is fully initialised, which can cause unexpected behaviour (e.g. a crash). So explicitly record when the array is ready for IO and don't allow IO through until then. There is no possibility for a similar problem when the array is going off-line as there must only be one 'open' at that time, and it is busy off-lining the array and so cannot send IO requests. So no memory barrier is needed in md_stop() This has been a bug since commit 409c57f3801 in 2.6.30 which introduced md_make_request. Before then, each personality would register its own make_request_fn when it was ready. This is suitable for any stable kernel from 2.6.30.y onwards. Cc: Signed-off-by: NeilBrown Reported-by: "Hawrylewicz Czarnowski, Przemyslaw" diff --git a/drivers/md/md.c b/drivers/md/md.c index a301912..540347c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) int rv; int cpu; - if (mddev == NULL || mddev->pers == NULL) { + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { bio_io_error(bio); return 0; } + smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -4564,7 +4566,8 @@ int md_run(mddev_t *mddev) mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - + smp_wmb(); + mddev->ready = 1; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { char nm[20]; @@ -4725,6 +4728,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); void md_stop(mddev_t *mddev) { + mddev->ready = 0; mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; diff --git a/drivers/md/md.h b/drivers/md/md.h index d05bab5..229675a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -148,7 +148,8 @@ struct mddev_s * are happening, so run/ * takeover/stop are not safe */ - + int ready; /* See when safe to pass + * IO requests down */ struct gendisk *gendisk; struct kobject kobj; -- cgit v0.10.2 From 43c73ca43b3e03bb228ff9350b6b44d0e560f262 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer With the module parameter 'start_dirty_degraded' set, raid5_spare_active() previously called sysfs_notify_dirent() with a NULL argument (rdev->sysfs_state) when a rebuild finished. Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 316fbe7..9212c07 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5338,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); -- cgit v0.10.2 From defad61a5b16352d3e22a04d4c930a5b5a7fd1f0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: md_stop_writes requires mddev_lock. As md_stop_writes manipulates the sync_thread and calls md_update_sb, it need to be called with mddev_lock held. In all internal cases it is, but the symbol is exported for dm-raid to call and in that case the lock won't be help. Do make an exported version which takes the lock, and an internal version which does not. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 540347c..f43ff29 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4704,7 +4704,7 @@ static void md_clean(mddev_t *mddev) mddev->plug = NULL; } -void md_stop_writes(mddev_t *mddev) +static void __md_stop_writes(mddev_t *mddev) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4724,6 +4724,13 @@ void md_stop_writes(mddev_t *mddev) md_update_sb(mddev, 1); } } + +void md_stop_writes(mddev_t *mddev) +{ + mddev_lock(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); +} EXPORT_SYMBOL_GPL(md_stop_writes); void md_stop(mddev_t *mddev) @@ -4748,7 +4755,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) goto out; } if (mddev->pers) { - md_stop_writes(mddev); + __md_stop_writes(mddev); err = -ENXIO; if (mddev->ro==1) @@ -4785,7 +4792,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) set_disk_ro(disk, 0); - md_stop_writes(mddev); + __md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; -- cgit v0.10.2 From 7ebc0be7fff4146e87b4078f054977b72998abd3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: Be more careful about clearing flags bit in ->recovery Setting ->recovery to 0 is generally not a good idea as it could clear bits that shouldn't be cleared. In particular, MD_RECOVERY_FROZEN should only be cleared on explicit request from user-space. So when we need to clear things, just clear the bits that need clearing. As there are a few different places which reap a resync process - and some do an incomplte job - factor out the code for doing the from md_check_recovery and call that function instead of open coding part of it. Signed-off-by: NeilBrown Reported-by: Jonathan Brassow diff --git a/drivers/md/md.c b/drivers/md/md.c index f43ff29..a91dc59 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3746,6 +3746,8 @@ action_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", type); } +static void reap_sync_thread(mddev_t *mddev); + static ssize_t action_store(mddev_t *mddev, const char *page, size_t len) { @@ -3760,9 +3762,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - mddev->recovery = 0; + reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -4709,8 +4709,7 @@ static void __md_stop_writes(mddev_t *mddev) if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; + reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -7044,6 +7043,45 @@ static int remove_and_add_spares(mddev_t *mddev) } return spares; } + +static void reap_sync_thread(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + md_update_sb(mddev, 1); + + /* if array is no-longer degraded, then any saved_raid_disk + * information must be scrapped + */ + if (!mddev->degraded) + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->saved_raid_disk = -1; + + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7068,9 +7106,6 @@ static int remove_and_add_spares(mddev_t *mddev) */ void md_check_recovery(mddev_t *mddev) { - mdk_rdev_t *rdev; - - if (mddev->bitmap) bitmap_daemon_work(mddev); @@ -7138,34 +7173,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - list_for_each_entry(rdev, &mddev->disks, same_set) - rdev->saved_raid_disk = -1; - - mddev->recovery = 0; - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7223,7 +7231,11 @@ void md_check_recovery(mddev_t *mddev) " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); } else md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); -- cgit v0.10.2 From 57b2caa394393f8870ed41bdcc38a7542593018f Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md-new-param-to-calc_dev_sboffset When we allow for separate devices for data and metadata in a later patch, we will need to be able to calculate the superblock offset based on more than the bdev. Signed-off-by: Jonathan Brassow diff --git a/drivers/md/md.c b/drivers/md/md.c index a91dc59..0a0d7c2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -705,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct block_device *bdev) +static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) { - sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } @@ -991,7 +991,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version * * It also happens to be a multiple of 4Kb. */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -1332,7 +1332,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -5249,7 +5249,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) printk(KERN_INFO "md: nonpersistent superblock ...\n"); rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; } else - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); @@ -5316,7 +5316,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); else rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; -- cgit v0.10.2 From ccebd4c4159462c96397ae9af9c667bb394d7b70 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md-new-param-to_sync_page_io Add new parameter to 'sync_page_io'. The new parameter allows us to distinguish between metadata and data operations. This becomes important later when we add the ability to use separate devices for data and metadata. Signed-off-by: Jonathan Brassow diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a1ffe3..1977765 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, || test_bit(Faulty, &rdev->flags)) continue; - target = rdev->sb_start + offset + index * (PAGE_SIZE/512); + target = offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev, target, roundup(size, bdev_logical_block_size(rdev->bdev)), - page, READ)) { + page, READ, true)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will * quietly no-op */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 0a0d7c2..0bc10cc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -795,7 +795,7 @@ static void bi_complete(struct bio *bio, int error) } int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw) + struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); struct completion event; @@ -804,7 +804,10 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, rw |= REQ_SYNC | REQ_UNPLUG; bio->bi_bdev = rdev->bdev; - bio->bi_sector = sector; + if (metadata_op) + bio->bi_sector = sector + rdev->sb_start; + else + bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); init_completion(&event); bio->bi_private = &event; @@ -829,7 +832,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; return 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 229675a..7e4f358 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -498,8 +498,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw); +extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d50056b..a23ffa3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1365,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ)) { + READ, false)) { success = 1; break; } @@ -1391,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev = conf->mirrors[d].rdev; atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - WRITE) == 0) + WRITE, false) == 0) md_error(mddev, rdev); } d = start; @@ -1406,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) continue; rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ) == 0) + READ, false) == 0) md_error(mddev, rdev); } } else { @@ -1489,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) success = 1; else { d++; @@ -1515,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, WRITE, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); @@ -1532,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 03825cb..69b6595 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1560,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); success = sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, - conf->tmppage, READ); + conf->tmppage, READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) @@ -1599,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + sect, + s<<9, conf->tmppage, WRITE, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -1636,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, conf->tmppage, - READ) == 0) { + READ, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE "md/raid10:%s: unable to read back " -- cgit v0.10.2 From a6ff7e089c7fca813c956ccbed824087e89a3a49 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: separate meta and data devs Allow the metadata to be on a separate device from the data. This doesn't mean the data and metadata will by on separate physical devices - it simply gives device-mapper and userspace tools more flexibility. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1977765..6cf5871 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { mdk_rdev_t *rdev = NULL; + struct block_device *bdev; mddev_t *mddev = bitmap->mddev; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_logical_block_size(rdev->bdev)); + bdev_logical_block_size(bdev)); /* Just make sure we aren't corrupting data or * metadata */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 0bc10cc..b98a85f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -765,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, */ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - bio->bi_bdev = rdev->bdev; + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -803,7 +803,8 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = rdev->bdev; + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? + rdev->meta_bdev : rdev->bdev; if (metadata_op) bio->bi_sector = sector + rdev->sb_start; else @@ -4435,7 +4436,9 @@ int md_run(mddev_t *mddev) * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. */ - if (rdev->data_offset < rdev->sb_start) { + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { @@ -5532,7 +5535,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * sb_start or, if that is sync_thread) return -EBUSY; diff --git a/drivers/md/md.h b/drivers/md/md.h index 7e4f358..eec517c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -60,6 +60,12 @@ struct mdk_rdev_s mddev_t *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). + */ + struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct page *sb_page; -- cgit v0.10.2 From 75d3da43cb74d2e5fb87816dbfecb839cd97c7f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: Don't let implementation detail of curr_resync leak out through sysfs. mddev->curr_resync has artificial values of '1' and '2' which are used by the code which ensures only one resync is happening at a time on any given device. These values are internal and should never be exposed to user-space (except when translated appropriately as in the 'pending' status in /proc/mdstat). Unfortunately they are as ->curr_resync is assigned to ->curr_resync_completed and that value is directly visible through sysfs. So change the assignments to ->curr_resync_completed to get the same valued from elsewhere in a form that doesn't have the magic '1' or '2' values. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6cf5871..9a35320 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1546,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); - bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index b98a85f..42b1d9e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6824,7 +6824,7 @@ void md_do_sync(mddev_t *mddev) desc, mdname(mddev)); mddev->curr_resync = j; } - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = j; while (j < max_sectors) { sector_t sectors; @@ -6842,8 +6842,7 @@ void md_do_sync(mddev_t *mddev) md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); - mddev->curr_resync_completed = - mddev->curr_resync; + mddev->curr_resync_completed = j; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9212c07..d223a6c0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4236,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -4337,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); -- cgit v0.10.2 From 23ddff3792f61193695114c68d6ebd57e974c4f8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: allow suspend_lo and suspend_hi to decrease as well as increase. The sysfs attributes 'suspend_lo' and 'suspend_hi' describe a region to which read/writes are suspended so that the under lying data can be manipulated without user-space noticing. Currently the window they describe can only move forwards along the device. However this is an unnecessary restriction which will cause problems with planned developments. So relax this restriction and allow these endpoints to move arbitrarily. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 42b1d9e..dd64ad3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4016,19 +4016,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if (new >= mddev->suspend_hi || - (new > mddev->suspend_lo && new < mddev->suspend_hi)) { - mddev->suspend_lo = new; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ mddev->pers->quiesce(mddev, 2); - return len; - } else - return -EINVAL; + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4045,20 +4050,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || - (new > mddev->suspend_lo && new > mddev->suspend_hi)) { - mddev->suspend_hi = new; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); - return len; - } else - return -EINVAL; + } + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); -- cgit v0.10.2 From 13ae864bc86ff65547ffe7e966b6433a0d0edb8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20R=C3=A9rolle?= Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: fix sync_completed reporting for very large drives (>2TB) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The values exported in the sync_completed file are unsigned long, which overflows with very large drives, resulting in wrong values reported. Since sync_completed uses sectors as unit, we'll start getting wrong values with components larger than 2TB. This patch simply replaces the use of unsigned long by unsigned long long. Signed-off-by: Rémi Rérolle Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index dd64ad3..5e3714f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3918,7 +3918,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_sectors, resync; + unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); @@ -3929,7 +3929,7 @@ sync_completed_show(mddev_t *mddev, char *page) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; - return sprintf(page, "%lu / %lu\n", resync, max_sectors); + return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); -- cgit v0.10.2 From 1a940fcee31ec6c18c2f24dbdad31d54e4c35048 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md/raid5: handle manually-added spares in start_reshape. It is possible to manually add spares to specific slots before starting a reshape. raid5_start_reshape should recognised this possibility and include it in the accounting. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d223a6c0..5044bab 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5527,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev) return -ENOSPC; list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) + if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks) + && !test_bit(Faulty, &rdev->flags)) spares++; if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) @@ -5588,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev) /* Failure here is OK */; } else break; + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + added_devices++; } /* When a reshape changes the number of devices, ->degraded -- cgit v0.10.2 From ba1b41b6b4e30cb66ae2775faadea05cae3ce61c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: range check slot number when manually adding a spare. When adding a spare to an active array, we should check the slot number, but allow it to be larger than raid_disks if a reshape is being prepared. Apply the same test when adding a device to an array-under-construction. It already had most of the test in place, but not quite all. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 5e3714f..6658513 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2479,6 +2479,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev2->raid_disk == slot) return -EEXIST; + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; + rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; @@ -2496,7 +2500,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { - if (slot >= rdev->mddev->raid_disks) + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ -- cgit v0.10.2 From bf2cb0dab8c97f00a71875d9b13dbac17a2f47ca Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: Fix removal of extra drives when converting RAID6 to RAID5 When a RAID6 is converted to a RAID5, the extra drive should be discarded. However it isn't due to a typo in a comparison. This bug was introduced in commit e93f68a1fc6 in 2.6.35-rc4 and is suitable for any -stable since than. As the extra drive is not removed, the 'degraded' counter is wrong and so the RAID5 will not respond correctly to a subsequent failure. Cc: stable@kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 6658513..3194a80 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3126,7 +3126,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) char nm[20]; if (rdev->raid_disk < 0) continue; - if (rdev->new_raid_disk > mddev->raid_disks) + if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; -- cgit v0.10.2