From 8811b5968f6216e97ccb9fe7b9883af39e339921 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 2 Aug 2012 08:33:00 +1000 Subject: raid5: make_request use batch stripe release make_request() does stripe release for every stripe and the stripe usually has count 1, which makes previous release_stripe() optimization not work. In my test, this release_stripe() becomes the heaviest pleace to take conf->device_lock after previous patches applied. Below patch makes stripe release batch. All the stripes will be released in unplug. The STRIPE_ON_UNPLUG_LIST bit is to protect concurrent access stripe lru. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bde9da2..978ba9b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -471,7 +471,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } else { if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru) - && !test_bit(STRIPE_EXPANDING, &sh->state)); + && !test_bit(STRIPE_EXPANDING, &sh->state) + && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -3988,6 +3989,62 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf) return sh; } +struct raid5_plug_cb { + struct blk_plug_cb cb; + struct list_head list; +}; + +static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) +{ + struct raid5_plug_cb *cb = container_of( + blk_cb, struct raid5_plug_cb, cb); + struct stripe_head *sh; + struct mddev *mddev = cb->cb.data; + struct r5conf *conf = mddev->private; + + if (cb->list.next && !list_empty(&cb->list)) { + spin_lock_irq(&conf->device_lock); + while (!list_empty(&cb->list)) { + sh = list_first_entry(&cb->list, struct stripe_head, lru); + list_del_init(&sh->lru); + /* + * avoid race release_stripe_plug() sees + * STRIPE_ON_UNPLUG_LIST clear but the stripe + * is still in our list + */ + smp_mb__before_clear_bit(); + clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); + __release_stripe(conf, sh); + } + spin_unlock_irq(&conf->device_lock); + } + kfree(cb); +} + +static void release_stripe_plug(struct mddev *mddev, + struct stripe_head *sh) +{ + struct blk_plug_cb *blk_cb = blk_check_plugged( + raid5_unplug, mddev, + sizeof(struct raid5_plug_cb)); + struct raid5_plug_cb *cb; + + if (!blk_cb) { + release_stripe(sh); + return; + } + + cb = container_of(blk_cb, struct raid5_plug_cb, cb); + + if (cb->list.next == NULL) + INIT_LIST_HEAD(&cb->list); + + if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) + list_add_tail(&sh->lru, &cb->list); + else + release_stripe(sh); +} + static void make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -4116,8 +4173,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - mddev_check_plugged(mddev); - release_stripe(sh); + release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2164021..9a7b36f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -319,6 +319,7 @@ enum { STRIPE_BIOFILL_RUN, STRIPE_COMPUTE_RUN, STRIPE_OPS_REQ_PENDING, + STRIPE_ON_UNPLUG_LIST, }; /* -- cgit v0.10.2 From 46a06401f6ba13e59d24746fa9ffa6773b69eee3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 2 Aug 2012 08:33:15 +1000 Subject: raid5: raid5d handle stripe in batch way Let raid5d handle stripe in batch way to reduce conf->device_lock locking. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 978ba9b..9e41ae3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4574,6 +4574,30 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) return handled; } +#define MAX_STRIPE_BATCH 8 +static int handle_active_stripes(struct r5conf *conf) +{ + struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; + int i, batch_size = 0; + + while (batch_size < MAX_STRIPE_BATCH && + (sh = __get_priority_stripe(conf)) != NULL) + batch[batch_size++] = sh; + + if (batch_size == 0) + return batch_size; + spin_unlock_irq(&conf->device_lock); + + for (i = 0; i < batch_size; i++) + handle_stripe(batch[i]); + + cond_resched(); + + spin_lock_irq(&conf->device_lock); + for (i = 0; i < batch_size; i++) + __release_stripe(conf, batch[i]); + return batch_size; +} /* * This is our raid5 kernel thread. @@ -4584,7 +4608,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) */ static void raid5d(struct mddev *mddev) { - struct stripe_head *sh; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; @@ -4598,6 +4621,7 @@ static void raid5d(struct mddev *mddev) spin_lock_irq(&conf->device_lock); while (1) { struct bio *bio; + int batch_size; if ( !list_empty(&conf->bitmap_list)) { @@ -4621,21 +4645,16 @@ static void raid5d(struct mddev *mddev) handled++; } - sh = __get_priority_stripe(conf); - - if (!sh) + batch_size = handle_active_stripes(conf); + if (!batch_size) break; - spin_unlock_irq(&conf->device_lock); - - handled++; - handle_stripe(sh); - release_stripe(sh); - cond_resched(); + handled += batch_size; - if (mddev->flags & ~(1<flags & ~(1<device_lock); md_check_recovery(mddev); - - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&conf->device_lock); + } } pr_debug("%d stripes handled\n", handled); -- cgit v0.10.2 From f54a9d0e59c4bea3db733921ca9147612a6f292c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Aug 2012 08:33:20 +1000 Subject: md/raid1: submit IO from originating thread instead of md thread. queuing writes to the md thread means that all requests go through the one processor which may not be able to keep up with very high request rates. So use the plugging infrastructure to submit all requests on unplug. If a 'schedule' is needed, we fall back on the old approach of handing the requests to the thread for it to handle. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 15dbe03..94e7f6b 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1305,7 +1305,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&bitmap->counts.lock); - io_schedule(); + schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36a8fc0..9f01870 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -870,6 +870,44 @@ do_sync_io: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); } +struct raid1_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r1conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r1conf *conf = mddev->private; @@ -883,6 +921,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid1_plug_cb *plug = NULL; int first_clone; int sectors_handled; int max_sectors; @@ -1185,11 +1225,22 @@ read_again: mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); + + cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid1_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, -- cgit v0.10.2 From d9f691c365a83ce2530f0e46b947365c2db44ea0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Aug 2012 08:35:43 +1000 Subject: md/dm-raid: DM_RAID should select MD_RAID10 Now that DM_RAID supports raid10, it needs to select that code to ensure it is included. Cc: Jonathan Brassow Reported-by: Fengguang Wu Signed-off-by: NeilBrown diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 10f122a..1de441a 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -277,13 +277,14 @@ config DM_MIRROR needed for live data migration tools such as 'pvmove'. config DM_RAID - tristate "RAID 1/4/5/6 target" + tristate "RAID 1/4/5/6/10 target" depends on BLK_DEV_DM select MD_RAID1 + select MD_RAID10 select MD_RAID456 select BLK_DEV_MD ---help--- - A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings + A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure -- cgit v0.10.2