From d8f05d2995d467a91db1af01637e6ffd94660ca8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:33:00 +1000 Subject: md/raid1: record badblocks found during resync etc. If we find a bad block while writing as part of resync/recovery we need to report that back to raid1d which must record the bad block, or fail the device. Similarly when fixing a read error, a further error should just record a bad block if possible rather than failing the device. Signed-off-by: NeilBrown Reviewed-by: Namhyung Kim diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b45d641..e695715 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1386,7 +1386,9 @@ static void end_sync_write(struct bio *bio, int error) s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); - md_error(mddev, conf->mirrors[mirror].rdev); + set_bit(WriteErrorSeen, + &conf->mirrors[mirror].rdev->flags); + set_bit(R1BIO_WriteError, &r1_bio->state); } else if (is_badblock(conf->mirrors[mirror].rdev, r1_bio->sector, r1_bio->sectors, @@ -1397,7 +1399,8 @@ static void end_sync_write(struct bio *bio, int error) if (atomic_dec_and_test(&r1_bio->remaining)) { int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, &r1_bio->state)) + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) reschedule_retry(r1_bio); else { put_buf(r1_bio); @@ -1406,6 +1409,20 @@ static void end_sync_write(struct bio *bio, int error) } } +static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector, + int sectors, struct page *page, int rw) +{ + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) + set_bit(WriteErrorSeen, &rdev->flags); + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + static int fix_sync_read_error(r1bio_t *r1_bio) { /* Try some synchronous reads of other devices to get @@ -1477,12 +1494,11 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, sect, s<<9, - bio->bi_io_vec[idx].bv_page, - WRITE, false) == 0) { + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + WRITE) == 0) { r1_bio->bios[d]->bi_end_io = NULL; rdev_dec_pending(rdev, mddev); - md_error(mddev, rdev); } } d = start; @@ -1493,11 +1509,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, sect, s<<9, - bio->bi_io_vec[idx].bv_page, - READ, false) == 0) - md_error(mddev, rdev); - else + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + READ) != 0) atomic_add(s, &rdev->corrected_errors); } sectors -= s; @@ -1682,8 +1696,10 @@ static void fix_read_error(conf_t *conf, int read_disk, } while (!success && d != read_disk); if (!success) { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[read_disk].rdev); + /* Cannot read from anywhere - mark it bad */ + mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + md_error(mddev, rdev); break; } /* write it back and re-read */ @@ -1694,13 +1710,9 @@ static void fix_read_error(conf_t *conf, int read_disk, d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, sect, s<<9, - conf->tmppage, WRITE, false) - == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } + test_bit(In_sync, &rdev->flags)) + r1_sync_page_io(rdev, sect, s, + conf->tmppage, WRITE); } d = start; while (d != read_disk) { @@ -1711,12 +1723,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, sect, s<<9, - conf->tmppage, READ, false) - == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else { + if (r1_sync_page_io(rdev, sect, s, + conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO "md/raid1:%s: read error corrected " @@ -1860,20 +1868,33 @@ static void raid1d(mddev_t *mddev) mddev = r1_bio->mddev; conf = mddev->private; if (test_bit(R1BIO_IsSync, &r1_bio->state)) { - if (test_bit(R1BIO_MadeGood, &r1_bio->state)) { + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) { int m; int s = r1_bio->sectors; for (m = 0; m < conf->raid_disks ; m++) { + mdk_rdev_t *rdev + = conf->mirrors[m].rdev; struct bio *bio = r1_bio->bios[m]; - if (bio->bi_end_io != NULL && - test_bit(BIO_UPTODATE, + if (bio->bi_end_io == NULL) + continue; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { - rdev = conf->mirrors[m].rdev; rdev_clear_badblocks( rdev, r1_bio->sector, r1_bio->sectors); } + if (!test_bit(BIO_UPTODATE, + &bio->bi_flags) && + test_bit(R1BIO_WriteError, + &r1_bio->state)) { + if (!rdev_set_badblocks( + rdev, + r1_bio->sector, + r1_bio->sectors, 0)) + md_error(mddev, rdev); + } } put_buf(r1_bio); md_done_sync(mddev, s, 1); -- cgit v0.10.2