From 88724bfa68be792c1487d759e87568c36ac1a1cc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Sep 2015 14:00:51 +1000 Subject: md: wait for pending superblock updates before switching to read-only If a superblock update is pending, wait for it to complete before letting md_set_readonly() switch to readonly. Otherwise we might lose important information about a device having failed. For external arrays, waiting for superblock updates can wait on user-space, so in that case, just return an error. Reported-and-tested-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 4f5ecbe..b8247bc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5409,9 +5409,13 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) * which will now never happen */ wake_up_process(mddev->sync_thread->tsk); + if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) + return -EBUSY; mddev_unlock(mddev); wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); -- cgit v0.10.2 From ebda780bce8d58ec0abab157397c9e099c41a05f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 18 Sep 2015 10:20:13 -0700 Subject: raid5: update analysis state for failed stripe handle_failed_stripe() makes the stripe fail, eg, all IO will return with a failure, but it doesn't update stripe_head_state. Later handle_stripe() has special handling for raid6 for handle_stripe_fill(). That check before handle_stripe_fill() doesn't skip the failed stripe and we get a kernel crash in need_this_block. This patch clear the analysis state to make sure no functions wrongly called after handle_failed_stripe() Signed-off-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 15ef2c6..2105e5f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3150,6 +3150,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); + if (bi) + s->to_read--; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = @@ -3169,6 +3171,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, */ clear_bit(R5_LOCKED, &sh->dev[i].flags); } + s->to_write = 0; + s->written = 0; if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) -- cgit v0.10.2 From 36707bb2e7c6730d79d6cdc6d1475d3d7e94c518 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Sep 2015 15:25:36 +1000 Subject: md/raid5: don't index beyond end of array in need_this_block(). When need_this_block probably shouldn't be called when there are more than 2 failed devices, we really don't want it to try indexing beyond the end of the failed_num[] of fdev[] arrays. So limit the loops to at most 2 iterations. Reported-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2105e5f..d66ff723 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3304,7 +3304,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, */ return 0; - for (i = 0; i < s->failed; i++) { + for (i = 0; i < s->failed && i < 2; i++) { if (fdev[i]->towrite && !test_bit(R5_UPTODATE, &fdev[i]->flags) && !test_bit(R5_OVERWRITE, &fdev[i]->flags)) @@ -3328,7 +3328,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; - for (i = 0; i < s->failed; i++) { + for (i = 0; i < s->failed && i < 2; i++) { if (s->failed_num[i] != sh->pd_idx && s->failed_num[i] != sh->qd_idx && !test_bit(R5_UPTODATE, &fdev[i]->flags) && -- cgit v0.10.2 From 66eefe5de11db1e0d8f2edc3880d50e7c36a9d43 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Sep 2015 15:47:47 +1000 Subject: md/raid0: apply base queue limits *before* disk_stack_limits Calling e.g. blk_queue_max_hw_sectors() after calls to disk_stack_limits() discards the settings determined by disk_stack_limits(). So we need to make those calls first. Fixes: 199dc6ed5179 ("md/raid0: update queue parameter in a safer location.") Cc: stable@vger.kernel.org (v2.6.35+ - please apply with 199dc6ed5179). Reported-by: Jes Sorensen Signed-off-by: NeilBrown diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 63e619b..f8e5db0 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -376,12 +376,6 @@ static int raid0_run(struct mddev *mddev) struct md_rdev *rdev; bool discard_supported = false; - rdev_for_each(rdev, mddev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) - discard_supported = true; - } blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); @@ -390,6 +384,12 @@ static int raid0_run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, (mddev->chunk_sectors << 9) * mddev->raid_disks); + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; + } if (!discard_supported) queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); else -- cgit v0.10.2 From d4929add83ad4660b1824a9282ab5dd4d60140fa Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 18 Sep 2015 10:20:12 -0700 Subject: md: clear CHANGE_PENDING in readonly array If faulty disks of an array are more than allowed degraded number, the array enters error handling. It will be marked as read-only with MD_CHANGE_PENDING/RECOVERY_NEEDED set. But currently recovery doesn't clear CHANGE_PENDING bit for read-only array. If MD_CHANGE_PENDING is set for a raid5 array, all returned IO will be hold on a list till the bit is clear. But recovery nevery clears this bit, the IO is always in pending state and nevery finish. This has bad effects like upper layer can't get an IO error and the array can't be stopped. Fixes: c3cce6cda162 ("md/raid5: ensure device failure recorded before write request returns.") Signed-off-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index b8247bc..c702de1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8164,6 +8164,7 @@ void md_check_recovery(struct mddev *mddev) md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); goto unlock; } -- cgit v0.10.2 From 644df1a85fc4b0c7a16800f55717261546f4e651 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Sep 2015 14:15:10 +0200 Subject: md: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); // Signed-off-by: Julia Lawall Signed-off-by: NeilBrown diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index d222522..d132f06 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -470,8 +470,7 @@ static int multipath_run (struct mddev *mddev) return 0; out_free_conf: - if (conf->pool) - mempool_destroy(conf->pool); + mempool_destroy(conf->pool); kfree(conf->multipaths); kfree(conf); mddev->private = NULL; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4517f06..5f4f553 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2843,8 +2843,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) abort: if (conf) { - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); @@ -2946,8 +2945,7 @@ static void raid1_free(struct mddev *mddev, void *priv) { struct r1conf *conf = priv; - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0fc33eb..7c99a40 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3486,8 +3486,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", mdname(mddev)); if (conf) { - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); + mempool_destroy(conf->r10bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf); @@ -3682,8 +3681,7 @@ static int run(struct mddev *mddev) out_free_conf: md_unregister_thread(&mddev->thread); - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); + mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); @@ -3696,8 +3694,7 @@ static void raid10_free(struct mddev *mddev, void *priv) { struct r10conf *conf = priv; - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); + mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf->mirrors_old); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d66ff723..49bb8d3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2271,8 +2271,7 @@ static void shrink_stripes(struct r5conf *conf) drop_one_stripe(conf)) ; - if (conf->slab_cache) - kmem_cache_destroy(conf->slab_cache); + kmem_cache_destroy(conf->slab_cache); conf->slab_cache = NULL; } -- cgit v0.10.2 From e8ff8bf09ff49733534ff3cee91bde030186055f Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Wed, 16 Sep 2015 10:20:05 -0400 Subject: md/raid1: Avoid raid1 resync getting stuck close_sync() needs to set conf->next_resync to a large, but safe value below MaxSector and use it to determine whether or not to set start_next_window in wait_barrier() Solution suggested by Neil Brown. Reported-by: Nate Dailey Tested-by: Xiao Ni Signed-off-by: Jes Sorensen Signed-off-by: NeilBrown diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5f4f553..049df6c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -881,8 +881,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) } if (bio && bio_data_dir(bio) == WRITE) { - if (bio->bi_iter.bi_sector >= - conf->mddev->curr_resync_completed) { + if (bio->bi_iter.bi_sector >= conf->next_resync) { if (conf->start_next_window == MaxSector) conf->start_next_window = conf->next_resync + @@ -1516,7 +1515,7 @@ static void close_sync(struct r1conf *conf) conf->r1buf_pool = NULL; spin_lock_irq(&conf->resync_lock); - conf->next_resync = 0; + conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; conf->start_next_window = MaxSector; conf->current_window_requests += conf->next_window_requests; -- cgit v0.10.2 From da6fb7a9e5bd6f04f7e15070f630bdf1ea502841 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 1 Oct 2015 16:03:38 +1000 Subject: md/bitmap: don't pass -1 to bitmap_storage_alloc. Passing -1 to bitmap_storage_alloc() causes page->index to be set to -1, which is quite problematic. So only pass ->cluster_slot if mddev_is_clustered(). Fixes: b97e92574c0b ("Use separate bitmaps for each nodes in the cluster") Cc: stable@vger.kernel.org (v4.1+) Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index e51de52..48b5890 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1997,7 +1997,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) ret = bitmap_storage_alloc(&store, chunks, !bitmap->mddev->bitmap_info.external, - bitmap->cluster_slot); + mddev_is_clustered(bitmap->mddev) + ? bitmap->cluster_slot : 0); if (ret) goto err; -- cgit v0.10.2