diff options
-rw-r--r-- | Documentation/md.txt | 22 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 12 | ||||
-rw-r--r-- | drivers/md/md.c | 107 | ||||
-rw-r--r-- | drivers/md/md.h | 82 | ||||
-rw-r--r-- | drivers/md/multipath.c | 7 | ||||
-rw-r--r-- | drivers/md/raid1.c | 174 | ||||
-rw-r--r-- | drivers/md/raid1.h | 7 | ||||
-rw-r--r-- | drivers/md/raid10.c | 582 | ||||
-rw-r--r-- | drivers/md/raid10.h | 61 | ||||
-rw-r--r-- | drivers/md/raid5.c | 557 | ||||
-rw-r--r-- | drivers/md/raid5.h | 98 | ||||
-rw-r--r-- | include/linux/raid/md_p.h | 7 | ||||
-rw-r--r-- | include/linux/raid/pq.h | 2 |
13 files changed, 1280 insertions, 438 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt index fc94770..993fba3 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -357,14 +357,14 @@ Each directory contains: written to, that device. state - A file recording the current state of the device in the array + A file recording the current state of the device in the array which can be a comma separated list of faulty - device has been kicked from active use due to - a detected fault or it has unacknowledged bad - blocks + a detected fault, or it has unacknowledged bad + blocks in_sync - device is a fully in-sync member of the array writemostly - device will only be subject to read - requests if there are no other options. + requests if there are no other options. This applies only to raid1 arrays. blocked - device has failed, and the failure hasn't been acknowledged yet by the metadata handler. @@ -374,6 +374,13 @@ Each directory contains: This includes spares that are in the process of being recovered to write_error - device has ever seen a write error. + want_replacement - device is (mostly) working but probably + should be replaced, either due to errors or + due to user request. + replacement - device is a replacement for another active + device with same raid_disk. + + This list may grow in future. This can be written to. Writing "faulty" simulates a failure on the device. @@ -386,6 +393,13 @@ Each directory contains: Writing "in_sync" sets the in_sync flag. Writing "write_error" sets writeerrorseen flag. Writing "-write_error" clears writeerrorseen flag. + Writing "want_replacement" is allowed at any time except to a + replacement device or a spare. It sets the flag. + Writing "-want_replacement" is allowed at any time. It clears + the flag. + Writing "replacement" or "-replacement" is only allowed before + starting the array. It sets or clears the flag. + This file responds to select/poll. Any change to 'faulty' or 'blocked' causes an event. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6d03774..cdf36b1 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev) return; } if (time_before(jiffies, bitmap->daemon_lastrun - + bitmap->mddev->bitmap_info.daemon_sleep)) + + mddev->bitmap_info.daemon_sleep)) goto done; bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; goto done; } bitmap->allclean = 1; @@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev) * sure that events_cleared is up-to-date. */ if (bitmap->need_sync && - bitmap->mddev->bitmap_info.external == 0) { + mddev->bitmap_info.external == 0) { bitmap_super_t *sb; bitmap->need_sync = 0; sb = kmap_atomic(bitmap->sb_page, KM_USER0); @@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev) done: if (bitmap->allclean == 0) - bitmap->mddev->thread->timeout = - bitmap->mddev->bitmap_info.daemon_sleep; + mddev->thread->timeout = + mddev->bitmap_info.daemon_sleep; mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n } if (!*bmc) { struct page *page; - *bmc = 1 | (needed ? NEEDED_MASK : 0); + *bmc = 2 | (needed ? NEEDED_MASK : 0); bitmap_count_page(bitmap, offset, 1); page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); diff --git a/drivers/md/md.c b/drivers/md/md.c index 5d1b676..ca8527f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); } + if (test_bit(Replacement, &rdev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); @@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page) len += sprintf(page+len, "%swrite_error", sep); sep = ","; } + if (test_bit(WantReplacement, &rdev->flags)) { + len += sprintf(page+len, "%swant_replacement", sep); + sep = ","; + } + if (test_bit(Replacement, &rdev->flags)) { + len += sprintf(page+len, "%sreplacement", sep); + sep = ","; + } + return len+sprintf(page+len, "\n"); } @@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "-write_error")) { clear_bit(WriteErrorSeen, &rdev->flags); err = 0; + } else if (cmd_match(buf, "want_replacement")) { + /* Any non-spare device that is not a replacement can + * become want_replacement at any time, but we then need to + * check if recovery is needed. + */ + if (rdev->raid_disk >= 0 && + !test_bit(Replacement, &rdev->flags)) + set_bit(WantReplacement, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "-want_replacement")) { + /* Clearing 'want_replacement' is always allowed. + * Once replacements starts it is too late though. + */ + err = 0; + clear_bit(WantReplacement, &rdev->flags); + } else if (cmd_match(buf, "replacement")) { + /* Can only set a device as a replacement when array has not + * yet been started. Once running, replacement is automatic + * from spares, or by assigning 'slot'. + */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + set_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "-replacement")) { + /* Similarly, can only clear Replacement before start */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + clear_bit(Replacement, &rdev->flags); + err = 0; + } } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev->raid_disk); + hot_remove_disk(rdev->mddev, rdev); if (err) return err; sysfs_unlink_rdev(rdev->mddev, rdev); @@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { - struct md_rdev *rdev2; /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ @@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; - list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) - if (rdev2->raid_disk == slot) - return -EEXIST; - if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; @@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, struct mddev *mddev = NULL; int ro; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + switch (cmd) { + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + break; + default: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } /* * Commands dealing with the RAID driver but not any @@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v) if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; - } else if (rdev->raid_disk < 0) + } + if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ + if (test_bit(Replacement, &rdev->flags)) + seq_printf(seq, "(R)"); sectors += rdev->sectors; } @@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev) ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { + mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } } - if (mddev->degraded) { - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk < 0 + && !test_bit(Faulty, &rdev->flags)) { + rdev->recovery_offset = 0; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } } } @@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev) test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { + mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } diff --git a/drivers/md/md.h b/drivers/md/md.h index cf742d9..44c63df 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -72,34 +72,7 @@ struct md_rdev { * This reduces the burden of testing multiple flags in many cases */ - unsigned long flags; -#define Faulty 1 /* device is known to have a fault */ -#define In_sync 2 /* device is in_sync with rest of array */ -#define WriteMostly 4 /* Avoid reading if at all possible */ -#define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occurred but has not yet - * been acknowledged by the metadata - * handler, so don't allow writes - * until it is cleared */ -#define WriteErrorSeen 9 /* A write error has been seen on this - * device - */ -#define FaultRecorded 10 /* Intermediate state for clearing - * Blocked. The Fault is/will-be - * recorded in the metadata, but that - * metadata hasn't been stored safely - * on disk yet. - */ -#define BlockedBadBlocks 11 /* A writer is blocked because they - * found an unacknowledged bad-block. - * This can safely be cleared at any - * time, and the writer will re-check. - * It may be set at any time, and at - * worst the writer will timeout and - * re-check. So setting it as - * accurately as possible is good, but - * not absolutely critical. - */ + unsigned long flags; /* bit set of 'enum flag_bits' bits. */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ @@ -152,6 +125,44 @@ struct md_rdev { sector_t size; /* in sectors */ } badblocks; }; +enum flag_bits { + Faulty, /* device is known to have a fault */ + In_sync, /* device is in_sync with rest of array */ + WriteMostly, /* Avoid reading if at all possible */ + AutoDetected, /* added by auto-detect */ + Blocked, /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes + * until it is cleared */ + WriteErrorSeen, /* A write error has been seen on this + * device + */ + FaultRecorded, /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ + BlockedBadBlocks, /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ + WantReplacement, /* This device is a candidate to be + * hot-replaced, either because it has + * reported some faults, or because + * of explicit request. + */ + Replacement, /* This device is a replacement for + * a want_replacement device with same + * raid_disk number. + */ +}; #define BB_LEN_MASK (0x00000000000001FFULL) #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) @@ -428,7 +439,7 @@ struct md_personality */ void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); - int (*hot_remove_disk) (struct mddev *mddev, int number); + int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); int (*resize) (struct mddev *mddev, sector_t sectors); @@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + if (!test_bit(Replacement, &rdev->flags)) { + sprintf(nm, "rd%d", rdev->raid_disk); + return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + return 0; } static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + if (!test_bit(Replacement, &rdev->flags)) { + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } } /* diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 5899246..a222f51 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) return err; } -static int multipath_remove_disk(struct mddev *mddev, int number) +static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; int err = 0; - struct md_rdev *rdev; + int number = rdev->raid_disk; struct multipath_info *p = conf->multipaths + number; print_multipath_conf(conf); - rdev = p->rdev; - if (rdev) { + if (rdev == p->rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { printk(KERN_ERR "hot-remove-disk, slot %d is identified" diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ede2461..cc24f0c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -135,7 +135,7 @@ out_free_pages: put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); j = -1; out_free_bio: - while ( ++j < pi->raid_disks ) + while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); r1bio_pool_free(r1_bio, data); return NULL; @@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) { int i; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct bio **bio = r1_bio->bios + i; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; int i; - for (i=0; i<conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct bio *bio = r1_bio->bios[i]; if (bio->bi_end_io) rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); @@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio) static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) { int mirror; - int raid_disks = r1_bio->mddev->raid_disks; + struct r1conf *conf = r1_bio->mddev->private; + int raid_disks = conf->raid_disks; - for (mirror = 0; mirror < raid_disks; mirror++) + for (mirror = 0; mirror < raid_disks * 2; mirror++) if (r1_bio->bios[mirror] == bio) break; - BUG_ON(mirror == raid_disks); + BUG_ON(mirror == raid_disks * 2); update_head_pos(mirror, r1_bio); return mirror; @@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error) if (!uptodate) { set_bit(WriteErrorSeen, &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + conf->mddev->recovery); + set_bit(R1BIO_WriteError, &r1_bio->state); } else { /* @@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect start_disk = conf->last_used; } - for (i = 0 ; i < conf->raid_disks ; i++) { + for (i = 0 ; i < conf->raid_disks * 2 ; i++) { sector_t dist; sector_t first_bad; int bad_sectors; @@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits) return 1; rcu_read_lock(); - for (i = 0; i < mddev->raid_disks; i++) { + for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -974,7 +980,7 @@ read_again: */ plugged = mddev_check_plugged(mddev); - disks = conf->raid_disks; + disks = conf->raid_disks * 2; retry_write: blocked_rdev = NULL; rcu_read_lock(); @@ -988,7 +994,8 @@ read_again: } r1_bio->bios[i] = NULL; if (!rdev || test_bit(Faulty, &rdev->flags)) { - set_bit(R1BIO_Degraded, &r1_bio->state); + if (i < conf->raid_disks) + set_bit(R1BIO_Degraded, &r1_bio->state); continue; } @@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev) */ for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = conf->mirrors[i].rdev; + struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; + if (repl + && repl->recovery_offset == MaxSector + && !test_bit(Faulty, &repl->flags) + && !test_and_set_bit(In_sync, &repl->flags)) { + /* replacement has just become active */ + if (!rdev || + !test_and_clear_bit(In_sync, &rdev->flags)) + count++; + if (rdev) { + /* Replaced device not technically + * faulty, but we need to be sure + * it gets removed and never re-added + */ + set_bit(Faulty, &rdev->flags); + sysfs_notify_dirent_safe( + rdev->sysfs_state); + } + } if (rdev && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { @@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) int mirror = 0; struct mirror_info *p; int first = 0; - int last = mddev->raid_disks - 1; + int last = conf->raid_disks - 1; if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - for (mirror = first; mirror <= last; mirror++) - if ( !(p=conf->mirrors+mirror)->rdev) { + for (mirror = first; mirror <= last; mirror++) { + p = conf->mirrors+mirror; + if (!p->rdev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } + if (test_bit(WantReplacement, &p->rdev->flags) && + p[conf->raid_disks].rdev == NULL) { + /* Add this device as a replacement */ + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + break; + } + } md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; } -static int raid1_remove_disk(struct mddev *mddev, int number) +static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; int err = 0; - struct md_rdev *rdev; + int number = rdev->raid_disk; struct mirror_info *p = conf->mirrors+ number; + if (rdev != p->rdev) + p = conf->mirrors + conf->raid_disks + number; + print_conf(conf); - rdev = p->rdev; - if (rdev) { + if (rdev == p->rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; @@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number) err = -EBUSY; p->rdev = rdev; goto abort; - } + } else if (conf->mirrors[conf->raid_disks + number].rdev) { + /* We just removed a device that is being replaced. + * Move down the replacement. We drain all IO before + * doing this to avoid confusion. + */ + struct md_rdev *repl = + conf->mirrors[conf->raid_disks + number].rdev; + raise_barrier(conf); + clear_bit(Replacement, &repl->flags); + p->rdev = repl; + conf->mirrors[conf->raid_disks + number].rdev = NULL; + lower_barrier(conf); + clear_bit(WantReplacement, &rdev->flags); + } else + clear_bit(WantReplacement, &rdev->flags); err = md_integrity_register(mddev); } abort: @@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error) } while (sectors_to_go > 0); set_bit(WriteErrorSeen, &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); } else if (is_badblock(conf->mirrors[mirror].rdev, r1_bio->sector, @@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) /* success */ return 1; - if (rw == WRITE) + if (rw == WRITE) { set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + rdev->mddev->recovery); + } /* need to record an error - either for the block or the device */ if (!rdev_set_badblocks(rdev, sector, sectors, 0)) md_error(rdev->mddev, rdev); @@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) } } d++; - if (d == conf->raid_disks) + if (d == conf->raid_disks * 2) d = 0; } while (!success && d != r1_bio->read_disk); @@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) mdname(mddev), bdevname(bio->bi_bdev, b), (unsigned long long)r1_bio->sector); - for (d = 0; d < conf->raid_disks; d++) { + for (d = 0; d < conf->raid_disks * 2; d++) { rdev = conf->mirrors[d].rdev; if (!rdev || test_bit(Faulty, &rdev->flags)) continue; @@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) /* write it back and re-read */ while (d != r1_bio->read_disk) { if (d == 0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; @@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) d = start; while (d != r1_bio->read_disk) { if (d == 0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; @@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio) int primary; int i; - for (primary = 0; primary < conf->raid_disks; primary++) + for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { r1_bio->bios[primary]->bi_end_io = NULL; @@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio) break; } r1_bio->read_disk = primary; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { int j; int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); struct bio *pbio = r1_bio->bios[primary]; @@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) { struct r1conf *conf = mddev->private; int i; - int disks = conf->raid_disks; + int disks = conf->raid_disks * 2; struct bio *bio, *wbio; bio = r1_bio->bios[r1_bio->read_disk]; @@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, success = 1; else { d++; - if (d == conf->raid_disks) + if (d == conf->raid_disks * 2) d = 0; } } while (!success && d != read_disk); @@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, start = d; while (d != read_disk) { if (d==0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; rdev = conf->mirrors[d].rdev; if (rdev && @@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, while (d != read_disk) { char b[BDEVNAME_SIZE]; if (d==0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; rdev = conf->mirrors[d].rdev; if (rdev && @@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio { int m; int s = r1_bio->sectors; - for (m = 0; m < conf->raid_disks ; m++) { + for (m = 0; m < conf->raid_disks * 2 ; m++) { struct md_rdev *rdev = conf->mirrors[m].rdev; struct bio *bio = r1_bio->bios[m]; if (bio->bi_end_io == NULL) @@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { int m; - for (m = 0; m < conf->raid_disks ; m++) + for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; rdev_clear_badblocks(rdev, @@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; bio = r1_bio->bios[i]; @@ -2203,7 +2267,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - still_degraded = 1; + if (i < conf->raid_disks) + still_degraded = 1; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_rw = WRITE; bio->bi_end_io = end_sync_write; @@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp * need to mark them bad on all write targets */ int ok = 1; - for (i = 0 ; i < conf->raid_disks ; i++) + for (i = 0 ; i < conf->raid_disks * 2 ; i++) if (r1_bio->bios[i]->bi_end_io == end_sync_write) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp len = sync_blocks<<9; } - for (i=0 ; i < conf->raid_disks; i++) { + for (i = 0 ; i < conf->raid_disks * 2; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io) { page = bio->bi_io_vec[bio->bi_vcnt].bv_page; @@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { atomic_set(&r1_bio->remaining, read_targets); - for (i=0; i<conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); @@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; - conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, + conf->mirrors = kzalloc(sizeof(struct mirror_info) + * mddev->raid_disks * 2, GFP_KERNEL); if (!conf->mirrors) goto abort; @@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); if (!conf->poolinfo) goto abort; - conf->poolinfo->raid_disks = mddev->raid_disks; + conf->poolinfo->raid_disks = mddev->raid_disks * 2; conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); @@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->poolinfo->mddev = mddev; + err = -EINVAL; spin_lock_init(&conf->device_lock); list_for_each_entry(rdev, &mddev->disks, same_set) { int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) continue; - disk = conf->mirrors + disk_idx; + if (test_bit(Replacement, &rdev->flags)) + disk = conf->mirrors + conf->raid_disks + disk_idx; + else + disk = conf->mirrors + disk_idx; + if (disk->rdev) + goto abort; disk->rdev = rdev; disk->head_position = 0; @@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; + err = -EIO; conf->last_used = -1; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { disk = conf->mirrors + i; + if (i < conf->raid_disks && + disk[conf->raid_disks].rdev) { + /* This slot has a replacement. */ + if (!disk->rdev) { + /* No original, just make the replacement + * a recovering spare + */ + disk->rdev = + disk[conf->raid_disks].rdev; + disk[conf->raid_disks].rdev = NULL; + } else if (!test_bit(In_sync, &disk->rdev->flags)) + /* Original is not in_sync - bad */ + goto abort; + } + if (!disk->rdev || !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; @@ -2455,7 +2543,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->last_used = i; } - err = -EIO; if (conf->last_used < 0) { printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", mdname(mddev)); @@ -2665,7 +2752,7 @@ static int raid1_reshape(struct mddev *mddev) if (!newpoolinfo) return -ENOMEM; newpoolinfo->mddev = mddev; - newpoolinfo->raid_disks = raid_disks; + newpoolinfo->raid_disks = raid_disks * 2; newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, newpoolinfo); @@ -2673,7 +2760,8 @@ static int raid1_reshape(struct mddev *mddev) kfree(newpoolinfo); return -ENOMEM; } - newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); + newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, + GFP_KERNEL); if (!newmirrors) { kfree(newpoolinfo); mempool_destroy(newpool); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c732b6c..80ded13 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -12,6 +12,9 @@ struct mirror_info { * pool was allocated for, so they know how much to allocate and free. * mddev->raid_disks cannot be used, as it can change while a pool is active * These two datums are stored in a kmalloced struct. + * The 'raid_disks' here is twice the raid_disks in r1conf. + * This allows space for each 'real' device can have a replacement in the + * second half of the array. */ struct pool_info { @@ -21,7 +24,9 @@ struct pool_info { struct r1conf { struct mddev *mddev; - struct mirror_info *mirrors; + struct mirror_info *mirrors; /* twice 'raid_disks' to + * allow for replacements. + */ int raid_disks; /* When choose the best device for a read (read_balance()) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 685ddf3..6e8aa21 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) struct r10conf *conf = data; int size = offsetof(struct r10bio, devs[conf->copies]); - /* allocate a r10bio with room for raid_disks entries in the bios array */ + /* allocate a r10bio with room for raid_disks entries in the + * bios array */ return kzalloc(size, gfp_flags); } @@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) if (!bio) goto out_free_bio; r10_bio->devs[j].bio = bio; + if (!conf->have_replacement) + continue; + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r10_bio->devs[j].repl_bio = bio; } /* * Allocate RESYNC_PAGES data pages and attach them * where needed. */ for (j = 0 ; j < nalloc; j++) { + struct bio *rbio = r10_bio->devs[j].repl_bio; bio = r10_bio->devs[j].bio; for (i = 0; i < RESYNC_PAGES; i++) { if (j == 1 && !test_bit(MD_RECOVERY_SYNC, @@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) goto out_free_pages; bio->bi_io_vec[i].bv_page = page; + if (rbio) + rbio->bi_io_vec[i].bv_page = page; } } @@ -156,8 +166,11 @@ out_free_pages: safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); j = -1; out_free_bio: - while ( ++j < nalloc ) + while (++j < nalloc) { bio_put(r10_bio->devs[j].bio); + if (r10_bio->devs[j].repl_bio) + bio_put(r10_bio->devs[j].repl_bio); + } r10bio_pool_free(r10_bio, conf); return NULL; } @@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data) } bio_put(bio); } + bio = r10bio->devs[j].repl_bio; + if (bio) + bio_put(bio); } r10bio_pool_free(r10bio, conf); } @@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) if (!BIO_SPECIAL(*bio)) bio_put(*bio); *bio = NULL; + bio = &r10_bio->devs[i].repl_bio; + if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; } } @@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio) * Find the disk number which triggered given bio */ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, - struct bio *bio, int *slotp) + struct bio *bio, int *slotp, int *replp) { int slot; + int repl = 0; - for (slot = 0; slot < conf->copies; slot++) + for (slot = 0; slot < conf->copies; slot++) { if (r10_bio->devs[slot].bio == bio) break; + if (r10_bio->devs[slot].repl_bio == bio) { + repl = 1; + break; + } + } BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); if (slotp) *slotp = slot; + if (replp) + *replp = repl; return r10_bio->devs[slot].devnum; } @@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; int slot, dev; + struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; dev = r10_bio->devs[slot].devnum; + rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: */ @@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error) */ set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } else { /* * oops, read error - keep the refcount on the rdev @@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error) printk_ratelimited(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", mdname(conf->mddev), - bdevname(conf->mirrors[dev].rdev->bdev, b), + bdevname(rdev->bdev, b), (unsigned long long)r10_bio->sector); set_bit(R10BIO_ReadError, &r10_bio->state); reschedule_retry(r10_bio); @@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error) int dev; int dec_rdev = 1; struct r10conf *conf = r10_bio->mddev->private; - int slot; + int slot, repl; + struct md_rdev *rdev = NULL; - dev = find_bio_disk(conf, r10_bio, bio, &slot); + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + smp_rmb(); + repl = 0; + rdev = conf->mirrors[dev].rdev; + } /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) { - set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); - set_bit(R10BIO_WriteError, &r10_bio->state); - dec_rdev = 0; + if (repl) + /* Never record new bad blocks to replacement, + * just fail it. + */ + md_error(rdev->mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + set_bit(R10BIO_WriteError, &r10_bio->state); + dec_rdev = 0; + } } else { /* * Set R10BIO_Uptodate in our master bio, so that @@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error) set_bit(R10BIO_Uptodate, &r10_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(conf->mirrors[dev].rdev, + if (is_badblock(rdev, r10_bio->devs[slot].addr, r10_bio->sectors, &first_bad, &bad_sectors)) { bio_put(bio); - r10_bio->devs[slot].bio = IO_MADE_GOOD; + if (repl) + r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; + else + r10_bio->devs[slot].bio = IO_MADE_GOOD; dec_rdev = 0; set_bit(R10BIO_MadeGood, &r10_bio->state); } @@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error) rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } - /* * RAID10 layout manager * As well as the chunksize and raid_disks count, there are two @@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q, * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry. */ -static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors) +static struct md_rdev *read_balance(struct r10conf *conf, + struct r10bio *r10_bio, + int *max_sectors) { const sector_t this_sector = r10_bio->sector; int disk, slot; int sectors = r10_bio->sectors; int best_good_sectors; sector_t new_distance, best_dist; - struct md_rdev *rdev; + struct md_rdev *rdev, *best_rdev; int do_balance; int best_slot; @@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s retry: sectors = r10_bio->sectors; best_slot = -1; + best_rdev = NULL; best_dist = MaxSector; best_good_sectors = 0; do_balance = 1; @@ -599,10 +652,16 @@ retry: if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; disk = r10_bio->devs[slot].devnum; - rdev = rcu_dereference(conf->mirrors[disk].rdev); + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev == NULL || test_bit(Faulty, &rdev->flags) || + r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + rdev = rcu_dereference(conf->mirrors[disk].rdev); if (rdev == NULL) continue; - if (!test_bit(In_sync, &rdev->flags)) + if (test_bit(Faulty, &rdev->flags)) + continue; + if (!test_bit(In_sync, &rdev->flags) && + r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) continue; dev_sector = r10_bio->devs[slot].addr; @@ -627,6 +686,7 @@ retry: if (good_sectors > best_good_sectors) { best_good_sectors = good_sectors; best_slot = slot; + best_rdev = rdev; } if (!do_balance) /* Must read from here */ @@ -655,16 +715,15 @@ retry: if (new_distance < best_dist) { best_dist = new_distance; best_slot = slot; + best_rdev = rdev; } } - if (slot == conf->copies) + if (slot >= conf->copies) { slot = best_slot; + rdev = best_rdev; + } if (slot >= 0) { - disk = r10_bio->devs[slot].devnum; - rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (!rdev) - goto retry; atomic_inc(&rdev->nr_pending); if (test_bit(Faulty, &rdev->flags)) { /* Cannot risk returning a device that failed @@ -675,11 +734,11 @@ retry: } r10_bio->read_slot = slot; } else - disk = -1; + rdev = NULL; rcu_read_unlock(); *max_sectors = best_good_sectors; - return disk; + return rdev; } static int raid10_congested(void *data, int bits) @@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf) static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev->private; - struct mirror_info *mirror; struct r10bio *r10_bio; struct bio *read_bio; int i; @@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* * read balancing logic: */ - int disk; + struct md_rdev *rdev; int slot; read_again: - disk = read_balance(conf, r10_bio, &max_sectors); - slot = r10_bio->read_slot; - if (disk < 0) { + rdev = read_balance(conf, r10_bio, &max_sectors); + if (!rdev) { raid_end_bio_io(r10_bio); return; } - mirror = conf->mirrors + disk; + slot = r10_bio->read_slot; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, max_sectors); r10_bio->devs[slot].bio = read_bio; + r10_bio->devs[slot].rdev = rdev; read_bio->bi_sector = r10_bio->devs[slot].addr + - mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; + rdev->data_offset; + read_bio->bi_bdev = rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; @@ -1025,6 +1083,7 @@ read_again: */ plugged = mddev_check_plugged(mddev); + r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); retry_write: blocked_rdev = NULL; @@ -1034,12 +1093,25 @@ retry_write: for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[d].replacement); + if (rdev == rrdev) + rrdev = NULL; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; break; } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + if (rrdev && test_bit(Faulty, &rrdev->flags)) + rrdev = NULL; + r10_bio->devs[i].bio = NULL; + r10_bio->devs[i].repl_bio = NULL; if (!rdev || test_bit(Faulty, &rdev->flags)) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; @@ -1088,6 +1160,10 @@ retry_write: } r10_bio->devs[i].bio = bio; atomic_inc(&rdev->nr_pending); + if (rrdev) { + r10_bio->devs[i].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } } rcu_read_unlock(); @@ -1096,11 +1172,23 @@ retry_write: int j; int d; - for (j = 0; j < i; j++) + for (j = 0; j < i; j++) { if (r10_bio->devs[j].bio) { d = r10_bio->devs[j].devnum; rdev_dec_pending(conf->mirrors[d].rdev, mddev); } + if (r10_bio->devs[j].repl_bio) { + struct md_rdev *rdev; + d = r10_bio->devs[j].devnum; + rdev = conf->mirrors[d].replacement; + if (!rdev) { + /* Race with remove_disk */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + rdev_dec_pending(rdev, mddev); + } + } allow_barrier(conf); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf); @@ -1147,6 +1235,31 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + + if (!r10_bio->devs[i].repl_bio) + continue; + + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + /* We are actively writing to the original device + * so it cannot disappear, so the replacement cannot + * become NULL here + */ + mbio->bi_sector = (r10_bio->devs[i].addr+ + conf->mirrors[d].replacement->data_offset); + mbio->bi_bdev = conf->mirrors[d].replacement->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); } /* Don't remove the bias on 'remaining' (one_write_done) until @@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev) */ for (i = 0; i < conf->raid_disks; i++) { tmp = conf->mirrors + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + if (tmp->replacement + && tmp->replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->replacement->flags) + && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + /* Replacement has just become active */ + if (!tmp->rdev + || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + count++; + if (tmp->rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. + */ + set_bit(Faulty, &tmp->rdev->flags); + sysfs_notify_dirent_safe( + tmp->rdev->sysfs_state); + } + sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); + } else if (tmp->rdev + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; sysfs_notify_dirent(tmp->rdev->sysfs_state); } @@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) struct mirror_info *p = &conf->mirrors[mirror]; if (p->recovery_disabled == mddev->recovery_disabled) continue; - if (p->rdev) - continue; + if (p->rdev) { + if (!test_bit(WantReplacement, &p->rdev->flags) || + p->replacement != NULL) + continue; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + break; + } disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) return err; } -static int raid10_remove_disk(struct mddev *mddev, int number) +static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r10conf *conf = mddev->private; int err = 0; - struct md_rdev *rdev; - struct mirror_info *p = conf->mirrors+ number; + int number = rdev->raid_disk; + struct md_rdev **rdevp; + struct mirror_info *p = conf->mirrors + number; print_conf(conf); - rdev = p->rdev; - if (rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - /* Only remove faulty devices in recovery - * is not possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != p->recovery_disabled && - enough(conf, -1)) { - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - err = md_integrity_register(mddev); + if (rdev == p->rdev) + rdevp = &p->rdev; + else if (rdev == p->replacement) + rdevp = &p->replacement; + else + return 0; + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; } + /* Only remove faulty devices if recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != p->recovery_disabled && + (!p->replacement || p->replacement == rdev) && + enough(conf, -1)) { + err = -EBUSY; + goto abort; + } + *rdevp = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } else if (p->replacement) { + /* We must have just cleared 'rdev' */ + p->rdev = p->replacement; + clear_bit(Replacement, &p->replacement->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither -- if they are careful. + */ + p->replacement = NULL; + clear_bit(WantReplacement, &rdev->flags); + } else + /* We might have just remove the Replacement as faulty + * Clear the flag just in case + */ + clear_bit(WantReplacement, &rdev->flags); + + err = md_integrity_register(mddev); + abort: print_conf(conf); @@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error) struct r10conf *conf = r10_bio->mddev->private; int d; - d = find_bio_disk(conf, r10_bio, bio, NULL); + d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); @@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error) sector_t first_bad; int bad_sectors; int slot; - - d = find_bio_disk(conf, r10_bio, bio, &slot); + int repl; + struct md_rdev *rdev = NULL; + + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[d].replacement; + if (!rdev) { + smp_mb(); + rdev = conf->mirrors[d].rdev; + } if (!uptodate) { - set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); - set_bit(R10BIO_WriteError, &r10_bio->state); - } else if (is_badblock(conf->mirrors[d].rdev, + if (repl) + md_error(mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + set_bit(R10BIO_WriteError, &r10_bio->state); + } + } else if (is_badblock(rdev, r10_bio->devs[slot].addr, r10_bio->sectors, &first_bad, &bad_sectors)) set_bit(R10BIO_MadeGood, &r10_bio->state); - rdev_dec_pending(conf->mirrors[d].rdev, mddev); + rdev_dec_pending(rdev, mddev); end_sync_request(r10_bio); } @@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) generic_make_request(tbio); } + /* Now write out to any replacement devices + * that are active + */ + for (i = 0; i < conf->copies; i++) { + int j, d; + int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); + + tbio = r10_bio->devs[i].repl_bio; + if (!tbio || !tbio->bi_end_io) + continue; + if (r10_bio->devs[i].bio->bi_end_io != end_sync_write + && r10_bio->devs[i].bio != fbio) + for (j = 0; j < vcnt; j++) + memcpy(page_address(tbio->bi_io_vec[j].bv_page), + page_address(fbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + d = r10_bio->devs[i].devnum; + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].replacement->bdev, + tbio->bi_size >> 9); + generic_make_request(tbio); + } + done: if (atomic_dec_and_test(&r10_bio->remaining)) { md_done_sync(mddev, r10_bio->sectors, 1); @@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) s << 9, bio->bi_io_vec[idx].bv_page, WRITE, false); - if (!ok) + if (!ok) { set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } } if (!ok) { /* We don't worry if we cannot set a bad block - @@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int d; - struct bio *wbio; + struct bio *wbio, *wbio2; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { fix_recovery_read_error(r10_bio); @@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) * share the pages with the first bio * and submit the write request */ - wbio = r10_bio->devs[1].bio; d = r10_bio->devs[1].devnum; - - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); - generic_make_request(wbio); + wbio = r10_bio->devs[1].bio; + wbio2 = r10_bio->devs[1].repl_bio; + if (wbio->bi_end_io) { + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); + generic_make_request(wbio); + } + if (wbio2 && wbio2->bi_end_io) { + atomic_inc(&conf->mirrors[d].replacement->nr_pending); + md_sync_acct(conf->mirrors[d].replacement->bdev, + wbio2->bi_size >> 9); + generic_make_request(wbio2); + } } @@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) /* success */ return 1; - if (rw == WRITE) + if (rw == WRITE) { set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } /* need to record an error - either for the block or the device */ if (!rdev_set_badblocks(rdev, sector, sectors, 0)) md_error(rdev->mddev, rdev); @@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) { int slot = r10_bio->read_slot; - int mirror = r10_bio->devs[slot].devnum; struct bio *bio; struct r10conf *conf = mddev->private; - struct md_rdev *rdev; + struct md_rdev *rdev = r10_bio->devs[slot].rdev; char b[BDEVNAME_SIZE]; unsigned long do_sync; int max_sectors; @@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } - rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); + rdev_dec_pending(rdev, mddev); bio = r10_bio->devs[slot].bio; bdevname(bio->bi_bdev, b); r10_bio->devs[slot].bio = mddev->ro ? IO_BLOCKED : NULL; read_more: - mirror = read_balance(conf, r10_bio, &max_sectors); - if (mirror == -1) { + rdev = read_balance(conf, r10_bio, &max_sectors); + if (rdev == NULL) { printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" " read error for block %llu\n", mdname(mddev), b, @@ -2103,7 +2326,6 @@ read_more: if (bio) bio_put(bio); slot = r10_bio->read_slot; - rdev = conf->mirrors[mirror].rdev; printk_ratelimited( KERN_ERR "md/raid10:%s: %s: redirecting" @@ -2117,6 +2339,7 @@ read_more: r10_bio->sector - bio->bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; + r10_bio->devs[slot].rdev = rdev; bio->bi_sector = r10_bio->devs[slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; @@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) r10_bio->sectors, 0)) md_error(conf->mddev, rdev); } + rdev = conf->mirrors[dev].replacement; + if (r10_bio->devs[m].repl_bio == NULL) + continue; + if (test_bit(BIO_UPTODATE, + &r10_bio->devs[m].repl_bio->bi_flags)) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } } put_buf(r10_bio); } else { @@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } rdev_dec_pending(rdev, conf->mddev); } + bio = r10_bio->devs[m].repl_bio; + rdev = conf->mirrors[dev].replacement; + if (rdev && bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + rdev_dec_pending(rdev, conf->mddev); + } } if (test_bit(R10BIO_WriteError, &r10_bio->state)) @@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev) static int init_resync(struct r10conf *conf) { int buffs; + int i; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; BUG_ON(conf->r10buf_pool); + conf->have_replacement = 0; + for (i = 0; i < conf->raid_disks; i++) + if (conf->mirrors[i].replacement) + conf->have_replacement = 1; conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); if (!conf->r10buf_pool) return -ENOMEM; @@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bitmap_end_sync(mddev->bitmap, sect, &sync_blocks, 1); } - } else /* completed sync */ + } else { + /* completed sync */ + if ((!mddev->bitmap || conf->fullsync) + && conf->have_replacement + && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* Completed a full sync so the replacements + * are now fully recovered. + */ + for (i = 0; i < conf->raid_disks; i++) + if (conf->mirrors[i].replacement) + conf->mirrors[i].replacement + ->recovery_offset + = MaxSector; + } conf->fullsync = 0; - + } bitmap_close_sync(mddev->bitmap); close_sync(conf); *skipped = 1; @@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; - - if (conf->mirrors[i].rdev == NULL || - test_bit(In_sync, &conf->mirrors[i].rdev->flags)) + struct mirror_info *mirror = &conf->mirrors[i]; + + if ((mirror->rdev == NULL || + test_bit(In_sync, &mirror->rdev->flags)) + && + (mirror->replacement == NULL || + test_bit(Faulty, + &mirror->replacement->flags))) continue; still_degraded = 0; /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); - /* Unless we are doing a full sync, we only need - * to recover the block if it is set in the bitmap + /* Unless we are doing a full sync, or a replacement + * we only need to recover the block if it is set in + * the bitmap */ must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, 1); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && + mirror->replacement == NULL && !conf->fullsync) { /* yep, skip the sync_blocks here, but don't assume * that there will never be anything to do here @@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_end_io = end_sync_read; bio->bi_rw = READ; from_addr = r10_bio->devs[j].addr; - bio->bi_sector = from_addr + - conf->mirrors[d].rdev->data_offset; - bio->bi_bdev = conf->mirrors[d].rdev->bdev; - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - atomic_inc(&r10_bio->remaining); - /* and we write to 'i' */ + bio->bi_sector = from_addr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&rdev->nr_pending); + /* and we write to 'i' (if not in_sync) */ for (k=0; k<conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; BUG_ON(k == conf->copies); - bio = r10_bio->devs[1].bio; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_write; - bio->bi_rw = WRITE; to_addr = r10_bio->devs[k].addr; - bio->bi_sector = to_addr + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - r10_bio->devs[0].devnum = d; r10_bio->devs[0].addr = from_addr; r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; + rdev = mirror->rdev; + if (!test_bit(In_sync, &rdev->flags)) { + bio = r10_bio->devs[1].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_sector = to_addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&r10_bio->remaining); + } else + r10_bio->devs[1].bio->bi_end_io = NULL; + + /* and maybe write to replacement */ + bio = r10_bio->devs[1].repl_bio; + if (bio) + bio->bi_end_io = NULL; + rdev = mirror->replacement; + /* Note: if rdev != NULL, then bio + * cannot be NULL as r10buf_pool_alloc will + * have allocated it. + * So the second test here is pointless. + * But it keeps semantic-checkers happy, and + * this comment keeps human reviewers + * happy. + */ + if (rdev == NULL || bio == NULL || + test_bit(Faulty, &rdev->flags)) + break; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_sector = to_addr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&r10_bio->remaining); break; } if (j == conf->copies) { @@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, for (k = 0; k < conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; - if (!rdev_set_badblocks( - conf->mirrors[i].rdev, + if (!test_bit(In_sync, + &mirror->rdev->flags) + && !rdev_set_badblocks( + mirror->rdev, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + if (mirror->replacement && + !rdev_set_badblocks( + mirror->replacement, r10_bio->devs[k].addr, max_sync, 0)) any_working = 0; @@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, printk(KERN_INFO "md/raid10:%s: insufficient " "working devices for recovery.\n", mdname(mddev)); - conf->mirrors[i].recovery_disabled + mirror->recovery_disabled = mddev->recovery_disabled; } break; @@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, sector_t first_bad, sector; int bad_sectors; + if (r10_bio->devs[i].repl_bio) + r10_bio->devs[i].repl_bio->bi_end_io = NULL; + bio = r10_bio->devs[i].bio; bio->bi_end_io = NULL; clear_bit(BIO_UPTODATE, &bio->bi_flags); @@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; count++; + + if (conf->mirrors[d].replacement == NULL || + test_bit(Faulty, + &conf->mirrors[d].replacement->flags)) + continue; + + /* Need to set up for writing to the replacement */ + bio = r10_bio->devs[i].repl_bio; + clear_bit(BIO_UPTODATE, &bio->bi_flags); + + sector = r10_bio->devs[i].addr; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_sector = sector + + conf->mirrors[d].replacement->data_offset; + bio->bi_bdev = conf->mirrors[d].replacement->bdev; + count++; } if (count < 2) { @@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (r10_bio->devs[i].bio->bi_end_io) rdev_dec_pending(conf->mirrors[d].rdev, mddev); + if (r10_bio->devs[i].repl_bio && + r10_bio->devs[i].repl_bio->bi_end_io) + rdev_dec_pending( + conf->mirrors[d].replacement, + mddev); } put_buf(r10_bio); biolist = NULL; @@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev) continue; disk = conf->mirrors + disk_idx; + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto out_free_conf; + disk->replacement = rdev; + } else { + if (disk->rdev) + goto out_free_conf; + disk->rdev = rdev; + } + disk->rdev = rdev; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev) disk = conf->mirrors + i; + if (!disk->rdev && disk->replacement) { + /* The replacement is all we have - use it */ + disk->rdev = disk->replacement; + disk->replacement = NULL; + clear_bit(Replacement, &disk->rdev->flags); + } + if (!disk->rdev || !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 7facfdf..7c615613 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -2,7 +2,7 @@ #define _RAID10_H struct mirror_info { - struct md_rdev *rdev; + struct md_rdev *rdev, *replacement; sector_t head_position; int recovery_disabled; /* matches * mddev->recovery_disabled @@ -18,12 +18,13 @@ struct r10conf { spinlock_t device_lock; /* geometry */ - int near_copies; /* number of copies laid out raid0 style */ + int near_copies; /* number of copies laid out + * raid0 style */ int far_copies; /* number of copies laid out * at large strides across drives */ - int far_offset; /* far_copies are offset by 1 stripe - * instead of many + int far_offset; /* far_copies are offset by 1 + * stripe instead of many */ int copies; /* near_copies * far_copies. * must be <= raid_disks @@ -34,10 +35,11 @@ struct r10conf { * 1 stripe. */ - sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ + sector_t dev_sectors; /* temp copy of + * mddev->dev_sectors */ - int chunk_shift; /* shift from chunks to sectors */ - sector_t chunk_mask; + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; struct list_head retry_list; /* queue pending writes and submit them on unplug */ @@ -45,20 +47,22 @@ struct r10conf { int pending_count; spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; sector_t next_resync; int fullsync; /* set to 1 if a full sync is needed, * (fresh device added). * Cleared when a sync completes. */ - + int have_replacement; /* There is at least one + * replacement device. + */ wait_queue_head_t wait_barrier; - mempool_t *r10bio_pool; - mempool_t *r10buf_pool; + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; struct page *tmppage; /* When taking over an array from a different personality, we store @@ -98,11 +102,18 @@ struct r10bio { * When resyncing we also use one for each copy. * When reconstructing, we use 2 bios, one for read, one for write. * We choose the number when they are allocated. + * We sometimes need an extra bio to write to the replacement. */ struct { - struct bio *bio; - sector_t addr; - int devnum; + struct bio *bio; + union { + struct bio *repl_bio; /* used for resync and + * writes */ + struct md_rdev *rdev; /* used for reads + * (read_slot >= 0) */ + }; + sector_t addr; + int devnum; } devs[0]; }; @@ -121,17 +132,19 @@ struct r10bio { #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) /* bits for r10bio.state */ -#define R10BIO_Uptodate 0 -#define R10BIO_IsSync 1 -#define R10BIO_IsRecover 2 -#define R10BIO_Degraded 3 +enum r10bio_state { + R10BIO_Uptodate, + R10BIO_IsSync, + R10BIO_IsRecover, + R10BIO_Degraded, /* Set ReadError on bios that experience a read error * so that raid10d knows what to do with them. */ -#define R10BIO_ReadError 4 + R10BIO_ReadError, /* If a write for this request means we can clear some * known-bad-block records, we set this flag. */ -#define R10BIO_MadeGood 5 -#define R10BIO_WriteError 6 + R10BIO_MadeGood, + R10BIO_WriteError, +}; #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 858fdbb..360f2b9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, * of the two sections, and some non-in_sync devices may * be insync in the section most affected by failed devices. */ -static int has_failed(struct r5conf *conf) +static int calc_degraded(struct r5conf *conf) { - int degraded; + int degraded, degraded2; int i; - if (conf->mddev->reshape_position == MaxSector) - return conf->mddev->degraded > conf->max_degraded; rcu_read_lock(); degraded = 0; @@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf) degraded++; } rcu_read_unlock(); - if (degraded > conf->max_degraded) - return 1; + if (conf->raid_disks == conf->previous_raid_disks) + return degraded; rcu_read_lock(); - degraded = 0; + degraded2 = 0; for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); if (!rdev || test_bit(Faulty, &rdev->flags)) - degraded++; + degraded2++; else if (test_bit(In_sync, &rdev->flags)) ; else @@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf) * almost certainly hasn't. */ if (conf->raid_disks <= conf->previous_raid_disks) - degraded++; + degraded2++; } rcu_read_unlock(); + if (degraded2 > degraded) + return degraded2; + return degraded; +} + +static int has_failed(struct r5conf *conf) +{ + int degraded; + + if (conf->mddev->reshape_position == MaxSector) + return conf->mddev->degraded > conf->max_degraded; + + degraded = calc_degraded(conf); if (degraded > conf->max_degraded) return 1; return 0; @@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) for (i = disks; i--; ) { int rw; - struct bio *bi; - struct md_rdev *rdev; + int replace_only = 0; + struct bio *bi, *rbi; + struct md_rdev *rdev, *rrdev = NULL; if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) rw = WRITE_FUA; @@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; - else + else if (test_and_clear_bit(R5_WantReplace, + &sh->dev[i].flags)) { + rw = WRITE; + replace_only = 1; + } else continue; bi = &sh->dev[i].req; + rbi = &sh->dev[i].rreq; /* For writing to replacement */ bi->bi_rw = rw; - if (rw & WRITE) + rbi->bi_rw = rw; + if (rw & WRITE) { bi->bi_end_io = raid5_end_write_request; - else + rbi->bi_end_io = raid5_end_write_request; + } else bi->bi_end_io = raid5_end_read_request; rcu_read_lock(); + rrdev = rcu_dereference(conf->disks[i].replacement); + smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev) { + rdev = rrdev; + rrdev = NULL; + } + if (rw & WRITE) { + if (replace_only) + rdev = NULL; + if (rdev == rrdev) + /* We raced and saw duplicates */ + rrdev = NULL; + } else { + if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) + rdev = rrdev; + rrdev = NULL; + } + if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) atomic_inc(&rdev->nr_pending); + if (rrdev && test_bit(Faulty, &rrdev->flags)) + rrdev = NULL; + if (rrdev) + atomic_inc(&rrdev->nr_pending); rcu_read_unlock(); /* We have already checked bad blocks for reads. Now - * need to check for writes. + * need to check for writes. We never accept write errors + * on the replacement, so we don't to check rrdev. */ while ((rw & WRITE) && rdev && test_bit(WriteErrorSeen, &rdev->flags)) { @@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } if (rdev) { - if (s->syncing || s->expanding || s->expanded) + if (s->syncing || s->expanding || s->expanded + || s->replacing) md_sync_acct(rdev->bdev, STRIPE_SECTORS); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) atomic_inc(&sh->count); bi->bi_sector = sh->sector + rdev->data_offset; bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; + if (rrdev) + set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); generic_make_request(bi); - } else { + } + if (rrdev) { + if (s->syncing || s->expanding || s->expanded + || s->replacing) + md_sync_acct(rrdev->bdev, STRIPE_SECTORS); + + set_bit(STRIPE_IO_STARTED, &sh->state); + + rbi->bi_bdev = rrdev->bdev; + pr_debug("%s: for %llu schedule op %ld on " + "replacement disc %d\n", + __func__, (unsigned long long)sh->sector, + rbi->bi_rw, i); + atomic_inc(&sh->count); + rbi->bi_sector = sh->sector + rrdev->data_offset; + rbi->bi_flags = 1 << BIO_UPTODATE; + rbi->bi_idx = 0; + rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; + rbi->bi_io_vec[0].bv_offset = 0; + rbi->bi_size = STRIPE_SIZE; + rbi->bi_next = NULL; + generic_make_request(rbi); + } + if (!rdev && !rrdev) { if (rw & WRITE) set_bit(STRIPE_DEGRADED, &sh->state); pr_debug("skip op %ld on disc %d for sector %llu\n", @@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error) int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); char b[BDEVNAME_SIZE]; - struct md_rdev *rdev; + struct md_rdev *rdev = NULL; for (i=0 ; i<disks; i++) @@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error) BUG(); return; } + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + /* If replacement finished while this request was outstanding, + * 'replacement' might be NULL already. + * In that case it moved down to 'rdev'. + * rdev is not removed until all requests are finished. + */ + rdev = conf->disks[i].replacement; + if (!rdev) + rdev = conf->disks[i].rdev; if (uptodate) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - rdev = conf->disks[i].rdev; + /* Note that this cannot happen on a + * replacement device. We just fail those on + * any error + */ printk_ratelimited( KERN_INFO "md/raid:%s: read error corrected" @@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } - if (atomic_read(&conf->disks[i].rdev->read_errors)) - atomic_set(&conf->disks[i].rdev->read_errors, 0); + if (atomic_read(&rdev->read_errors)) + atomic_set(&rdev->read_errors, 0); } else { - const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); + const char *bdn = bdevname(rdev->bdev, b); int retry = 0; - rdev = conf->disks[i].rdev; clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); - if (conf->mddev->degraded >= conf->max_degraded) + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error on replacement device " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); + else if (conf->mddev->degraded >= conf->max_degraded) printk_ratelimited( KERN_WARNING "md/raid:%s: read error not correctable " @@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error) md_error(conf->mddev, rdev); } } - rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error) struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; + struct md_rdev *uninitialized_var(rdev); int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); sector_t first_bad; int bad_sectors; + int replacement = 0; - for (i=0 ; i<disks; i++) - if (bi == &sh->dev[i].req) + for (i = 0 ; i < disks; i++) { + if (bi == &sh->dev[i].req) { + rdev = conf->disks[i].rdev; break; - + } + if (bi == &sh->dev[i].rreq) { + rdev = conf->disks[i].replacement; + if (rdev) + replacement = 1; + else + /* rdev was removed and 'replacement' + * replaced it. rdev is not removed + * until all requests are finished. + */ + rdev = conf->disks[i].rdev; + break; + } + } pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), uptodate); @@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error) return; } - if (!uptodate) { - set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); - set_bit(R5_WriteError, &sh->dev[i].flags); - } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors)) - set_bit(R5_MadeGood, &sh->dev[i].flags); + if (replacement) { + if (!uptodate) + md_error(conf->mddev, rdev); + else if (is_badblock(rdev, sh->sector, + STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); + } else { + if (!uptodate) { + set_bit(WriteErrorSeen, &rdev->flags); + set_bit(R5_WriteError, &sh->dev[i].flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } else if (is_badblock(rdev, sh->sector, + STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_MadeGood, &sh->dev[i].flags); + } + rdev_dec_pending(rdev, conf->mddev); - rdev_dec_pending(conf->disks[i].rdev, conf->mddev); - - clear_bit(R5_LOCKED, &sh->dev[i].flags); + if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) + clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } - static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); static void raid5_build_block(struct stripe_head *sh, int i, int previous) @@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) dev->req.bi_io_vec = &dev->vec; dev->req.bi_vcnt++; dev->req.bi_max_vecs++; + dev->req.bi_private = sh; dev->vec.bv_page = dev->page; - dev->vec.bv_len = STRIPE_SIZE; - dev->vec.bv_offset = 0; - dev->req.bi_sector = sh->sector; - dev->req.bi_private = sh; + bio_init(&dev->rreq); + dev->rreq.bi_io_vec = &dev->rvec; + dev->rreq.bi_vcnt++; + dev->rreq.bi_max_vecs++; + dev->rreq.bi_private = sh; + dev->rvec.bv_page = dev->page; dev->flags = 0; dev->sector = compute_blocknr(sh, i, previous); @@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r5conf *conf = mddev->private; + unsigned long flags; pr_debug("raid456: error called\n"); - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * if recovery was running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - } + spin_lock_irqsave(&conf->device_lock, flags); + clear_bit(In_sync, &rdev->flags); + mddev->degraded = calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, md_done_sync(conf->mddev, STRIPE_SECTORS, 0); clear_bit(STRIPE_SYNCING, &sh->state); s->syncing = 0; + s->replacing = 0; /* There is nothing more to do for sync/check/repair. - * For recover we need to record a bad block on all + * For recover/replace we need to record a bad block on all * non-sync devices, or abort the recovery */ if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) @@ -2373,12 +2487,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, */ for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = conf->disks[i].rdev; - if (!rdev - || test_bit(Faulty, &rdev->flags) - || test_bit(In_sync, &rdev->flags)) - continue; - if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + rdev = conf->disks[i].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) abort = 1; } if (abort) { @@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, } } +static int want_replace(struct stripe_head *sh, int disk_idx) +{ + struct md_rdev *rdev; + int rv = 0; + /* Doing recovery so rcu locking not required */ + rdev = sh->raid_conf->disks[disk_idx].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && (rdev->recovery_offset <= sh->sector + || rdev->mddev->recovery_cp <= sh->sector)) + rv = 1; + + return rv; +} + /* fetch_block - checks the given member device to see if its data needs * to be read or computed to satisfy a request. * @@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx)) || (s->failed >= 1 && fdev[0]->toread) || (s->failed >= 2 && fdev[1]->toread) || (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && @@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) } } - /* * handle_stripe - do things to a stripe. * - * We lock the stripe and then examine the state of various bits - * to see what needs to be done. + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. * Possible results: - * return some read request which now have data - * return some write requests which are safely on disc + * return some read requests which now have data + * return some write requests which are safely on storage * schedule a read on some buffers * schedule a write of some buffers * return confirmation of parity correctness * - * buffers are taken off read_list or write_list, and bh_cache buffers - * get BH_Lock set before the stripe lock is released. - * */ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) @@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) int disks = sh->disks; struct r5dev *dev; int i; + int do_recovery = 0; memset(s, 0, sizeof(*s)); - s->syncing = test_bit(STRIPE_SYNCING, &sh->state); s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); s->failed_num[0] = -1; @@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) dev = &sh->dev[i]; pr_debug("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); + i, dev->flags, + dev->toread, dev->towrite, dev->written); /* maybe we can reply to a read * * new wantfill requests are only permitted while @@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } if (dev->written) s->written++; - rdev = rcu_dereference(conf->disks[i].rdev); + /* Prefer to use the replacement for reads, but only + * if it is recovered enough and has no bad blocks. + */ + rdev = rcu_dereference(conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && + !is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_ReadRepl, &dev->flags); + else { + if (rdev) + set_bit(R5_NeedReplace, &dev->flags); + rdev = rcu_dereference(conf->disks[i].rdev); + clear_bit(R5_ReadRepl, &dev->flags); + } if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) { @@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) set_bit(R5_Insync, &dev->flags); if (rdev && test_bit(R5_WriteError, &dev->flags)) { - clear_bit(R5_Insync, &dev->flags); - if (!test_bit(Faulty, &rdev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 == rdev) + clear_bit(R5_Insync, &dev->flags); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { s->handle_bad_blocks = 1; - atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev2->nr_pending); } else clear_bit(R5_WriteError, &dev->flags); } if (rdev && test_bit(R5_MadeGood, &dev->flags)) { - if (!test_bit(Faulty, &rdev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { s->handle_bad_blocks = 1; - atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev2->nr_pending); } else clear_bit(R5_MadeGood, &dev->flags); } + if (test_bit(R5_MadeGoodRepl, &dev->flags)) { + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].replacement); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_MadeGoodRepl, &dev->flags); + } if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (s->failed < 2) s->failed_num[s->failed] = i; s->failed++; + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; } } spin_unlock_irq(&conf->device_lock); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + /* If there is a failed device being replaced, + * we must be recovering. + * else if we are after recovery_cp, we must be syncing + * else we can only be replacing + * sync and recovery both need to read all devices, and so + * use the same flag. + */ + if (do_recovery || + sh->sector >= conf->mddev->recovery_cp) + s->syncing = 1; + else + s->replacing = 1; + } rcu_read_unlock(); } @@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh) if (unlikely(s.blocked_rdev)) { if (s.syncing || s.expanding || s.expanded || - s.to_write || s.written) { + s.replacing || s.to_write || s.written) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; } @@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh) sh->reconstruct_state = 0; if (s.to_read+s.to_write+s.written) handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); - if (s.syncing) + if (s.syncing + s.replacing) handle_failed_sync(conf, sh, &s); } @@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh) */ if (s.to_read || s.non_overwrite || (conf->level == 6 && s.to_write && s.failed) - || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) handle_stripe_fill(sh, &s, disks); /* Now we check to see if any write operations have recently @@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh) handle_parity_checks5(conf, sh, &s, disks); } - if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + if (s.replacing && s.locked == 0 + && !test_bit(STRIPE_INSYNC, &sh->state)) { + /* Write out to replacement devices where possible */ + for (i = 0; i < conf->raid_disks; i++) + if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && + test_bit(R5_NeedReplace, &sh->dev[i].flags)) { + set_bit(R5_WantReplace, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + set_bit(STRIPE_INSYNC, &sh->state); + } + if ((s.syncing || s.replacing) && s.locked == 0 && + test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); } @@ -3363,6 +3560,15 @@ finish: STRIPE_SECTORS); rdev_dec_pending(rdev, conf->mddev); } + if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { + rdev = conf->disks[i].replacement; + if (!rdev) + /* rdev have been moved down */ + rdev = conf->disks[i].rdev; + rdev_clear_badblocks(rdev, sh->sector, + STRIPE_SECTORS); + rdev_dec_pending(rdev, conf->mddev); + } } if (s.ops_request) @@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) int dd_idx; struct bio* align_bi; struct md_rdev *rdev; + sector_t end_sector; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("chunk_aligned_read : non aligned\n"); @@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 0, &dd_idx, NULL); + end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); rcu_read_lock(); - rdev = rcu_dereference(conf->disks[dd_idx].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) { + rdev = rcu_dereference(conf->disks[dd_idx].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags) || + rdev->recovery_offset < end_sector) { + rdev = rcu_dereference(conf->disks[dd_idx].rdev); + if (rdev && + (test_bit(Faulty, &rdev->flags) || + !(test_bit(In_sync, &rdev->flags) || + rdev->recovery_offset >= end_sector))) + rdev = NULL; + } + if (rdev) { sector_t first_bad; int bad_sectors; @@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); sh = get_active_stripe(conf, sector_nr, 0, 1, 0); @@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) return handled; } - set_bit(R5_ReadError, &sh->dev[dd_idx].flags); if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { release_stripe(sh); raid5_set_bi_hw_segments(raid_bio, scnt); @@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev) continue; disk = conf->disks + raid_disk; - disk->rdev = rdev; + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto abort; + disk->replacement = rdev; + } else { + if (disk->rdev) + goto abort; + disk->rdev = rdev; + } if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; @@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev) int dirty_parity_disks = 0; struct md_rdev *rdev; sector_t reshape_offset = 0; + int i; if (mddev->recovery_cp != MaxSector) printk(KERN_NOTICE "md/raid:%s: not clean" @@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev) conf->thread = NULL; mddev->private = conf; - /* - * 0 for a fully functional array, 1 or 2 for a degraded array. - */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk < 0) + for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; + i++) { + rdev = conf->disks[i].rdev; + if (!rdev && conf->disks[i].replacement) { + /* The replacement is all we have yet */ + rdev = conf->disks[i].replacement; + conf->disks[i].replacement = NULL; + clear_bit(Replacement, &rdev->flags); + conf->disks[i].rdev = rdev; + } + if (!rdev) continue; + if (conf->disks[i].replacement && + conf->reshape_progress != MaxSector) { + /* replacements and reshape simply do not mix. */ + printk(KERN_ERR "md: cannot handle concurrent " + "replacement and reshape.\n"); + goto abort; + } if (test_bit(In_sync, &rdev->flags)) { working_disks++; continue; @@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev) dirty_parity_disks++; } - mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) - - working_disks); + /* + * 0 for a fully functional array, 1 or 2 for a degraded array. + */ + mddev->degraded = calc_degraded(conf); if (has_failed(conf)) { printk(KERN_ERR "md/raid:%s: not enough operational devices" @@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; - if (tmp->rdev + if (tmp->replacement + && tmp->replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->replacement->flags) + && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + /* Replacement has just become active. */ + if (!tmp->rdev + || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + count++; + if (tmp->rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. + */ + set_bit(Faulty, &tmp->rdev->flags); + sysfs_notify_dirent_safe( + tmp->rdev->sysfs_state); + } + sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); + } else if (tmp->rdev && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { @@ -5025,49 +5282,68 @@ static int raid5_spare_active(struct mddev *mddev) } } spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded -= count; + mddev->degraded = calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); print_raid5_conf(conf); return count; } -static int raid5_remove_disk(struct mddev *mddev, int number) +static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; int err = 0; - struct md_rdev *rdev; + int number = rdev->raid_disk; + struct md_rdev **rdevp; struct disk_info *p = conf->disks + number; print_raid5_conf(conf); - rdev = p->rdev; - if (rdev) { - if (number >= conf->raid_disks && - conf->reshape_progress == MaxSector) - clear_bit(In_sync, &rdev->flags); + if (rdev == p->rdev) + rdevp = &p->rdev; + else if (rdev == p->replacement) + rdevp = &p->replacement; + else + return 0; - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - /* Only remove non-faulty devices if recovery - * isn't possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && - !has_failed(conf) && - number < conf->raid_disks) { - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - } + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && + !has_failed(conf) && + (!p->replacement || p->replacement == rdev) && + number < conf->raid_disks) { + err = -EBUSY; + goto abort; + } + *rdevp = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } else if (p->replacement) { + /* We must have just cleared 'rdev' */ + p->rdev = p->replacement; + clear_bit(Replacement, &p->replacement->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither - if they are careful + */ + p->replacement = NULL; + clear_bit(WantReplacement, &rdev->flags); + } else + /* We might have just removed the Replacement as faulty- + * clear the bit just in case + */ + clear_bit(WantReplacement, &rdev->flags); abort: print_raid5_conf(conf); @@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk = rdev->saved_raid_disk; else disk = first; - for ( ; disk <= last ; disk++) - if ((p=conf->disks + disk)->rdev == NULL) { + for ( ; disk <= last ; disk++) { + p = conf->disks + disk; + if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; err = 0; @@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } + if (test_bit(WantReplacement, &p->rdev->flags) && + p->replacement == NULL) { + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = disk; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + break; + } + } print_raid5_conf(conf); return err; } @@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev) * pre and post number of devices. */ spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) - - added_devices; + mddev->degraded = calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); } mddev->raid_disks = conf->raid_disks; @@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev) revalidate_disk(mddev->gendisk); } else { int d; - mddev->degraded = conf->raid_disks; - for (d = 0; d < conf->raid_disks ; d++) - if (conf->disks[d].rdev && - test_bit(In_sync, - &conf->disks[d].rdev->flags)) - mddev->degraded--; + spin_lock_irq(&conf->device_lock); + mddev->degraded = calc_degraded(conf); + spin_unlock_irq(&conf->device_lock); for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; d++) { struct md_rdev *rdev = conf->disks[d].rdev; - if (rdev && raid5_remove_disk(mddev, d) == 0) { + if (rdev && + raid5_remove_disk(mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index e10c553..8d8e139 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -27,7 +27,7 @@ * The possible state transitions are: * * Empty -> Want - on read or write to get old data for parity calc - * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) + * Empty -> Dirty - on compute_parity to satisfy write/sync request. * Empty -> Clean - on compute_block when computing a block for failed drive * Want -> Empty - on failed read * Want -> Clean - on successful completion of read request @@ -226,8 +226,11 @@ struct stripe_head { #endif } ops; struct r5dev { - struct bio req; - struct bio_vec vec; + /* rreq and rvec are used for the replacement device when + * writing data to both devices. + */ + struct bio req, rreq; + struct bio_vec vec, rvec; struct page *page; struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ @@ -239,7 +242,13 @@ struct stripe_head { * for handle_stripe. */ struct stripe_head_state { - int syncing, expanding, expanded; + /* 'syncing' means that we need to read all devices, either + * to check/correct parity, or to reconstruct a missing device. + * 'replacing' means we are replacing one or more drives and + * the source is valid at this point so we don't need to + * read all devices, just the replacement targets. + */ + int syncing, expanding, expanded, replacing; int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; int failed_num[2]; @@ -252,38 +261,41 @@ struct stripe_head_state { int handle_bad_blocks; }; -/* Flags */ -#define R5_UPTODATE 0 /* page contains current data */ -#define R5_LOCKED 1 /* IO has been submitted on "req" */ -#define R5_OVERWRITE 2 /* towrite covers whole page */ +/* Flags for struct r5dev.flags */ +enum r5dev_flags { + R5_UPTODATE, /* page contains current data */ + R5_LOCKED, /* IO has been submitted on "req" */ + R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */ + R5_OVERWRITE, /* towrite covers whole page */ /* and some that are internal to handle_stripe */ -#define R5_Insync 3 /* rdev && rdev->in_sync at start */ -#define R5_Wantread 4 /* want to schedule a read */ -#define R5_Wantwrite 5 -#define R5_Overlap 7 /* There is a pending overlapping request on this block */ -#define R5_ReadError 8 /* seen a read error here recently */ -#define R5_ReWrite 9 /* have tried to over-write the readerror */ + R5_Insync, /* rdev && rdev->in_sync at start */ + R5_Wantread, /* want to schedule a read */ + R5_Wantwrite, + R5_Overlap, /* There is a pending overlapping request + * on this block */ + R5_ReadError, /* seen a read error here recently */ + R5_ReWrite, /* have tried to over-write the readerror */ -#define R5_Expanded 10 /* This block now has post-expand data */ -#define R5_Wantcompute 11 /* compute_block in progress treat as - * uptodate - */ -#define R5_Wantfill 12 /* dev->toread contains a bio that needs - * filling - */ -#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ -#define R5_WantFUA 14 /* Write should be FUA */ -#define R5_WriteError 15 /* got a write error - need to record it */ -#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ -/* - * Write method - */ -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 -/* not a write method, but a compute_parity mode */ -#define CHECK_PARITY 3 -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 + R5_Expanded, /* This block now has post-expand data */ + R5_Wantcompute, /* compute_block in progress treat as + * uptodate + */ + R5_Wantfill, /* dev->toread contains a bio that needs + * filling + */ + R5_Wantdrain, /* dev->towrite needs to be drained */ + R5_WantFUA, /* Write should be FUA */ + R5_WriteError, /* got a write error - need to record it */ + R5_MadeGood, /* A bad block has been fixed by writing to it */ + R5_ReadRepl, /* Will/did read from replacement rather than orig */ + R5_MadeGoodRepl,/* A bad block on the replacement device has been + * fixed by writing to it */ + R5_NeedReplace, /* This device has a replacement which is not + * up-to-date at this stripe. */ + R5_WantReplace, /* We need to update the replacement, we have read + * data in, and now is a good time to write it out. + */ +}; /* * Stripe state @@ -311,13 +323,14 @@ enum { /* * Operation request flags */ -#define STRIPE_OP_BIOFILL 0 -#define STRIPE_OP_COMPUTE_BLK 1 -#define STRIPE_OP_PREXOR 2 -#define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_RECONSTRUCT 4 -#define STRIPE_OP_CHECK 5 - +enum { + STRIPE_OP_BIOFILL, + STRIPE_OP_COMPUTE_BLK, + STRIPE_OP_PREXOR, + STRIPE_OP_BIODRAIN, + STRIPE_OP_RECONSTRUCT, + STRIPE_OP_CHECK, +}; /* * Plugging: * @@ -344,13 +357,12 @@ enum { struct disk_info { - struct md_rdev *rdev; + struct md_rdev *rdev, *replacement; }; struct r5conf { struct hlist_head *stripe_hashtbl; struct mddev *mddev; - struct disk_info *spare; int chunk_sectors; int level, algorithm; int max_degraded; diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 9e65d9e..6f6df86 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -277,7 +277,10 @@ struct mdp_superblock_1 { */ #define MD_FEATURE_RESHAPE_ACTIVE 4 #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ - -#define MD_FEATURE_ALL (1|2|4|8) +#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an + * active device with same 'role'. + * 'recovery_offset' is also set. + */ +#define MD_FEATURE_ALL (1|2|4|8|16) #endif diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 2b59cc8..53272e9 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -132,7 +132,7 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, PROT_READ|PROT_WRITE, \ MAP_PRIVATE|MAP_ANONYMOUS,\ 0, 0)) -# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) +# define free_pages(x, y) munmap((void *)(x), PAGE_SIZE << (y)) static inline void cpu_relax(void) { |