From 75aaf4c3e6a4ed48207230cf133a02258ca5abd5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Jan 2015 08:29:50 +0000 Subject: x86/raid6: correctly check for assembler capabilities Just like for AVX2 (which simply needs an #if -> #ifdef conversion), SSSE3 assembler support should be checked for before using it. Signed-off-by: Jan Beulich Cc: Jim Kukunas Acked-by: Thomas Gleixner Signed-off-by: NeilBrown diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 920e616..5ba2d9c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -148,6 +148,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) +asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 7d0e5cd..dbef231 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -89,10 +89,10 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) #ifdef CONFIG_AS_AVX2 &raid6_recov_avx2, #endif +#ifdef CONFIG_AS_SSSE3 &raid6_recov_ssse3, #endif &raid6_recov_intx1, diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index e1eea43..53fe3d7 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -8,7 +8,7 @@ * of the License. */ -#if CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX2 #include #include "x86.h" diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index a916832..cda33e5 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -7,6 +7,8 @@ * of the License. */ +#ifdef CONFIG_AS_SSSE3 + #include #include "x86.h" @@ -330,3 +332,7 @@ const struct raid6_recov_calls raid6_recov_ssse3 = { #endif .priority = 1, }; + +#else +#warning "your version of binutils lacks SSSE3 support" +#endif -- cgit v0.10.2 From ad3ab8b608c454f004391bb8568916bd955001ea Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Thu, 29 Jan 2015 12:38:29 -0500 Subject: md: do_release_stripe(): No need to call md_wakeup_thread() twice 67f455486d2ea20b2d94d6adf5b9b783d079e321 introduced a call to md_wakeup_thread() when adding to the delayed_list. However the md thread is woken up unconditionally just below. Remove the unnecessary wakeup call. Signed-off-by: Jes Sorensen Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b98765f..274db18 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - if (atomic_read(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { -- cgit v0.10.2 From 2c58f06e6fe23905c4c586191bef0acc51536a9d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 2 Feb 2015 11:32:23 +1100 Subject: md/raid5: separate large if clause out of fetch_block(). fetch_block() has a very large and hard to read 'if' condition. Separate it into its own function so that it can be made more readable. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 274db18..5a980fe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2895,14 +2895,14 @@ static int want_replace(struct stripe_head *sh, int disk_idx) * Returns 1 when no more member devices need to be checked, otherwise returns * 0 to tell the loop in handle_stripe_fill to continue */ -static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) + +static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || @@ -2919,7 +2919,18 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, && s->failed && s->to_write && (s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) + return 1; + return 0; +} + +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + + /* is the data in this block needed, and can we get it? */ + if (need_this_block(sh, s, disk_idx, disks)) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ -- cgit v0.10.2 From a79cfe12c619aa0fc401f9148d78faa6fc61a331 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 2 Feb 2015 11:37:59 +1100 Subject: md/raid5: separate out the easy conditions in need_this_block. Some of the conditions in need_this_block have very straight forward motivation. Separate those out and document them. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5a980fe..21c1e79 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2903,14 +2903,34 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->replacing && want_replace(sh, disk_idx)) || - (s->failed >= 1 && fdev[0]->toread) || - (s->failed >= 2 && fdev[1]->toread) || + + if (test_bit(R5_LOCKED, &dev->flags) || + test_bit(R5_UPTODATE, &dev->flags)) + /* No point reading this as we already have it or have + * decided to get it. + */ + return 0; + + if (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) + /* We need this block to directly satisfy a request */ + return 1; + + if (s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx))) + /* When syncing, or expanding we read everything. + * When replacing, we need the replaced block. + */ + return 1; + + if ((s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread)) + /* If we want to read from a failed device, then + * we need to actually read every other device. + */ + return 1; + + if ( (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || @@ -2919,7 +2939,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, && s->failed && s->to_write && (s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))) return 1; return 0; } -- cgit v0.10.2 From a9d56950f763fa3e9d831541e62d223197d2ff60 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 2 Feb 2015 11:49:10 +1100 Subject: md/raid5: need_this_block: start simplifying the last two conditions. Both the last two cases are only relevant if something has failed and something needs to be written (but not over-written), and if it is OK to pre-read blocks at this point. So factor out those tests and explain them. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 21c1e79..bb42551 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2930,16 +2930,34 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, */ return 1; + /* Sometimes neither read-modify-write nor reconstruct-write + * cycles can work. In those cases we read every block we + * can. Then the parity-update is certain to have enough to + * work with. + * This can only be a problem when we need to write something, + * and some device has failed. If either of those tests + * fail we need look no further. + */ + if (!s->failed || !s->to_write) + return 0; + + if (test_bit(R5_Insync, &dev->flags) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + /* Pre-reads at not permitted until after short delay + * to gather multiple requests. However if this + * device is no Insync, the block could only be be computed + * and there is no need to delay that. + */ + return 0; if ( - (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && + (sh->raid_conf->level <= 5 && fdev[0]->towrite && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || ((sh->raid_conf->level == 6 || sh->sector >= sh->raid_conf->mddev->recovery_cp) - && s->failed && s->to_write && + && (s->to_write - s->non_overwrite < - sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))) + sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) + )) return 1; return 0; } -- cgit v0.10.2 From ea664c8245f3d5e78d05d1250bc0be0d60e264af Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 2 Feb 2015 14:03:28 +1100 Subject: md/raid5: need_this_block: tidy/fix last condition. That last condition is unclear and over cautious. There are two related issues here. If a partial write is destined for a missing device, then either RMW or RCW can work. We must read all the available block. Only then can the missing blocks be calculated, and then the parity update performed. If RMW is not an option, then there is a complication even without partial writes. If we would need to read a missing device to perform the reconstruction, then we must first read every block so the missing device data can be computed. This is the case for RAID6 (Which currently does not support RMW) and for times when we don't trust the parity (after a crash) and so are in the process of resyncing it. So make these two cases more clear and separate, and perform the relevant tests more thoroughly. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb42551..a03cf2d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2902,6 +2902,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; + int i; if (test_bit(R5_LOCKED, &dev->flags) || @@ -2949,16 +2950,37 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * and there is no need to delay that. */ return 0; - if ( - (sh->raid_conf->level <= 5 && fdev[0]->towrite && - !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - ((sh->raid_conf->level == 6 || - sh->sector >= sh->raid_conf->mddev->recovery_cp) - && - (s->to_write - s->non_overwrite < - sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) - )) - return 1; + + for (i = 0; i < s->failed; i++) { + if (fdev[i]->towrite && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + /* If we have a partial write to a failed + * device, then we will need to reconstruct + * the content of that device, so all other + * devices must be read. + */ + return 1; + } + + /* If we are forced to do a reconstruct-write, either because + * the current RAID6 implementation only supports that, or + * or because parity cannot be trusted and we are currently + * recovering it, there is extra need to be careful. + * If one of the devices that we would need to read, because + * it is not being overwritten (and maybe not written at all) + * is missing/faulty, then we need to read everything we can. + */ + if (sh->raid_conf->level != 6 && + sh->sector < sh->raid_conf->mddev->recovery_cp) + /* reconstruct-write isn't being forced */ + return 0; + for (i = 0; i < s->failed; i++) { + if (!test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + return 1; + } + return 0; } -- cgit v0.10.2 From 85572d7c75fd5b9fa3fc911e1c99c68ec74903a0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:56 +1100 Subject: md: rename mddev->write_lock to mddev->lock This lock is used for (slightly) more than helping with writing superblocks, and it will soon be extended further. So the name is inappropriate. Also, the _irq variant hasn't been needed since 2.6.37 as it is never taking from interrupt or bh context. So: -rename write_lock to lock -document what it protects -remove _irq ... except in md_flush_request() as there is no wait_event_lock() (with no _irq). This can be cleaned up after appropriate changes to wait.h. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 709755f..17e7fd7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -397,12 +397,12 @@ static void md_submit_flush_data(struct work_struct *ws) void md_flush_request(struct mddev *mddev, struct bio *bio) { - spin_lock_irq(&mddev->write_lock); + spin_lock_irq(&mddev->lock); wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio, - mddev->write_lock); + mddev->lock); mddev->flush_bio = bio; - spin_unlock_irq(&mddev->write_lock); + spin_unlock_irq(&mddev->lock); INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); @@ -465,7 +465,7 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); - spin_lock_init(&mddev->write_lock); + spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); @@ -2230,7 +2230,7 @@ repeat: return; } - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); mddev->utime = get_seconds(); @@ -2287,7 +2287,7 @@ repeat: } sync_sbs(mddev, nospares); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); @@ -2326,15 +2326,15 @@ repeat: md_super_wait(mddev); /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync != sync_req || test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); goto repeat; } clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); @@ -3722,7 +3722,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case clean: if (mddev->pers) { restart_array(mddev); - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { mddev->in_sync = 1; @@ -3733,7 +3733,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = 0; } else err = -EBUSY; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } else err = -EINVAL; break; @@ -7102,7 +7102,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; if (mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7110,7 +7110,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) md_wakeup_thread(mddev->thread); did_change = 1; } - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -7148,7 +7148,7 @@ int md_allow_write(struct mddev *mddev) if (!mddev->pers->sync_request) return 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7156,11 +7156,11 @@ int md_allow_write(struct mddev *mddev) if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); } else - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; @@ -7688,7 +7688,7 @@ void md_check_recovery(struct mddev *mddev) if (!mddev->external) { int did_change = 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && @@ -7699,7 +7699,7 @@ void md_check_recovery(struct mddev *mddev) } if (mddev->safemode == 1) mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 03cec5b..f0d15bd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -386,7 +386,13 @@ struct mddev { struct work_struct del_work; /* used for delayed sysfs removal */ - spinlock_t write_lock; + /* "lock" protects: + * flush_bio transition from NULL to !NULL + * rdev superblocks, events + * clearing MD_CHANGE_* + * in_sync - and related safemode and MD_CHANGE changes + */ + spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ -- cgit v0.10.2 From 5c675f83c68fbdf9c0e103c1090b06be747fa62c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:56 +1100 Subject: md: make ->congested robust against personality changes. There is currently no locking around calls to the 'congested' bdi function. If called at an awkward time while an array is being converted from one level (or personality) to another, there is a tiny chance of running code in an unreferenced module etc. So add a 'congested' function to the md_personality operations structure, and call it with appropriate locking from a central 'mddev_congested'. When the array personality is changing the array will be 'suspended' so no IO is processed. If mddev_congested detects this, it simply reports that the array is congested, which is a safe guess. As mddev_suspend calls synchronize_rcu(), mddev_congested can avoid races by included the whole call inside an rcu_read_lock() region. This require that the congested functions for all subordinate devices can be run under rcu_lock. Fortunately this is the case. Signed-off-by: NeilBrown diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 07c0fa0..777d9ba 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - if (rs->raid_type->level == 1) - return md_raid1_congested(&rs->md, bits); - - if (rs->raid_type->level == 10) - return md_raid10_congested(&rs->md, bits); - - return md_raid5_congested(&rs->md, bits); + return mddev_congested(&rs->md, bits); } /* diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 64713b7..0510851 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -97,15 +97,11 @@ static int linear_mergeable_bvec(struct request_queue *q, return maxsectors << 9; } -static int linear_congested(void *data, int bits) +static int linear_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct linear_conf *conf; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); conf = rcu_dereference(mddev->private); @@ -218,8 +214,6 @@ static int linear_run (struct mddev *mddev) md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->backing_dev_info.congested_fn = linear_congested; - mddev->queue->backing_dev_info.congested_data = mddev; ret = md_integrity_register(mddev); if (ret) { @@ -366,6 +360,7 @@ static struct md_personality linear_personality = .status = linear_status, .hot_add_disk = linear_add, .size = linear_size, + .congested = linear_congested, }; static int __init linear_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index 17e7fd7..d45f52e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -321,9 +321,23 @@ EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(struct mddev *mddev, int bits) { - return mddev->suspended; + struct md_personality *pers = mddev->pers; + int ret = 0; + + rcu_read_lock(); + if (mddev->suspended) + ret = 1; + else if (pers && pers->congested) + ret = pers->congested(mddev, bits); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(mddev_congested); +static int md_congested(void *data, int bits) +{ + struct mddev *mddev = data; + return mddev_congested(mddev, bits); } -EXPORT_SYMBOL(mddev_congested); /* * Generic flush handling for md @@ -4908,6 +4922,10 @@ int md_run(struct mddev *mddev) bitmap_destroy(mddev); return err; } + if (mddev->queue) { + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = md_congested; + } if (mddev->pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) diff --git a/drivers/md/md.h b/drivers/md/md.h index f0d15bd..f260228 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -496,6 +496,9 @@ struct md_personality * array. */ void *(*takeover) (struct mddev *mddev); + /* congested implements bdi.congested_fn(). + * Will not be called while array is 'suspended' */ + int (*congested)(struct mddev *mddev, int bits); }; struct md_sysfs_entry { diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 399272f..fedb1b3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev) seq_printf (seq, "]"); } -static int multipath_congested(void *data, int bits) +static int multipath_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct mpconf *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); for (i = 0; i < mddev->raid_disks ; i++) { struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); @@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev) */ md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - mddev->queue->backing_dev_info.congested_fn = multipath_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - if (md_integrity_register(mddev)) goto out_free_conf; @@ -533,6 +526,7 @@ static struct md_personality multipath_personality = .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, .size = multipath_size, + .congested = multipath_congested, }; static int __init multipath_init (void) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ba6b85d..4b521ea 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -25,17 +25,13 @@ #include "raid0.h" #include "raid5.h" -static int raid0_congested(void *data, int bits) +static int raid0_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct r0conf *conf = mddev->private; struct md_rdev **devlist = conf->devlist; int raid_disks = conf->strip_zone[0].nb_dev; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); @@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) mdname(mddev), (unsigned long long)smallest->sectors); } - mddev->queue->backing_dev_info.congested_fn = raid0_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* * now since we have the hard sector sizes, we can make sure @@ -729,6 +723,7 @@ static struct md_personality raid0_personality= .size = raid0_size, .takeover = raid0_takeover, .quiesce = raid0_quiesce, + .congested = raid0_congested, }; static int __init raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 40b35be..9ad7ce7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -734,7 +734,7 @@ static int raid1_mergeable_bvec(struct request_queue *q, } -int md_raid1_congested(struct mddev *mddev, int bits) +static int raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; int i, ret = 0; @@ -763,15 +763,6 @@ int md_raid1_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid1_congested); - -static int raid1_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid1_congested(mddev, bits); -} static void flush_pending_writes(struct r1conf *conf) { @@ -2955,8 +2946,6 @@ static int run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); if (mddev->queue) { - mddev->queue->backing_dev_info.congested_fn = raid1_congested; - mddev->queue->backing_dev_info.congested_data = mddev; blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); if (discard_supported) @@ -3193,6 +3182,7 @@ static struct md_personality raid1_personality = .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, .takeover = raid1_takeover, + .congested = raid1_congested, }; static int __init raid_init(void) diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 33bda55..14ebb28 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -170,7 +170,4 @@ struct r1bio { */ #define R1BIO_MadeGood 7 #define R1BIO_WriteError 8 - -extern int md_raid1_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 32e282f..fb6b886 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -910,7 +910,7 @@ retry: return rdev; } -int md_raid10_congested(struct mddev *mddev, int bits) +static int raid10_congested(struct mddev *mddev, int bits) { struct r10conf *conf = mddev->private; int i, ret = 0; @@ -934,15 +934,6 @@ int md_raid10_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid10_congested); - -static int raid10_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid10_congested(mddev, bits); -} static void flush_pending_writes(struct r10conf *conf) { @@ -3757,8 +3748,6 @@ static int run(struct mddev *mddev) if (mddev->queue) { int stripe = conf->geo.raid_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - mddev->queue->backing_dev_info.congested_fn = raid10_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... @@ -4727,6 +4716,7 @@ static struct md_personality raid10_personality = .check_reshape = raid10_check_reshape, .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, + .congested = raid10_congested, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 157d69e..5ee6473 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -150,7 +150,4 @@ enum r10bio_state { */ R10BIO_Previous, }; - -extern int md_raid10_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a03cf2d..502a908 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4149,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf, } } -int md_raid5_congested(struct mddev *mddev, int bits) +static int raid5_congested(struct mddev *mddev, int bits) { struct r5conf *conf = mddev->private; @@ -4166,15 +4166,6 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 0; } -EXPORT_SYMBOL_GPL(md_raid5_congested); - -static int raid5_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid5_congested(mddev, bits); -} /* We want read requests to align with chunks where possible, * but write requests don't need to. @@ -6248,9 +6239,6 @@ static int run(struct mddev *mddev) blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; - chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * @@ -6333,8 +6321,6 @@ static int stop(struct mddev *mddev) struct r5conf *conf = mddev->private; md_unregister_thread(&mddev->thread); - if (mddev->queue) - mddev->queue->backing_dev_info.congested_fn = NULL; free_conf(conf); mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; @@ -7126,6 +7112,7 @@ static struct md_personality raid6_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, + .congested = raid5_congested, }; static struct md_personality raid5_personality = { @@ -7148,6 +7135,7 @@ static struct md_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, + .congested = raid5_congested, }; static struct md_personality raid4_personality = @@ -7171,6 +7159,7 @@ static struct md_personality raid4_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid4_takeover, + .congested = raid5_congested, }; static int __init raid5_init(void) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d59f5ca..983e18a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout) return layout >= 8 && layout <= 10; } -extern int md_raid5_congested(struct mddev *mddev, int bits); extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); #endif -- cgit v0.10.2 From 64590f45ddc7147fa1968147a1f5b5c436b728fe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:57 +1100 Subject: md: make merge_bvec_fn more robust in face of personality changes. There is no locking around calls to merge_bvec_fn(), so it is possible that calls which coincide with a level (or personality) change could go wrong. So create a central dispatch point for these functions and use rcu_read_lock(). If the array is suspended, reject any merge that can be rejected. If not, we know it is safe to call the function. Signed-off-by: NeilBrown diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 0510851..4c2a92c 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -60,11 +60,10 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, +static int linear_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct dev_info *dev0; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); @@ -213,8 +212,6 @@ static int linear_run (struct mddev *mddev) mddev->private = conf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - ret = md_integrity_register(mddev); if (ret) { kfree(conf); @@ -361,6 +358,7 @@ static struct md_personality linear_personality = .hot_add_disk = linear_add, .size = linear_size, .congested = linear_congested, + .mergeable_bvec = linear_mergeable_bvec, }; static int __init linear_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index d45f52e..9f0ff71 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -339,6 +339,29 @@ static int md_congested(void *data, int bits) return mddev_congested(mddev, bits); } +static int md_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + int ret; + rcu_read_lock(); + if (mddev->suspended) { + /* Must always allow one vec */ + if (bvm->bi_size == 0) + ret = biovec->bv_len; + else + ret = 0; + } else { + struct md_personality *pers = mddev->pers; + if (pers && pers->mergeable_bvec) + ret = pers->mergeable_bvec(mddev, bvm, biovec); + else + ret = biovec->bv_len; + } + rcu_read_unlock(); + return ret; +} /* * Generic flush handling for md */ @@ -4925,6 +4948,7 @@ int md_run(struct mddev *mddev) if (mddev->queue) { mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = md_congested; + blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec); } if (mddev->pers->sync_request) { if (mddev->kobj.sd && diff --git a/drivers/md/md.h b/drivers/md/md.h index f260228..bee5b85 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -499,6 +499,10 @@ struct md_personality /* congested implements bdi.congested_fn(). * Will not be called while array is 'suspended' */ int (*congested)(struct mddev *mddev, int bits); + /* mergeable_bvec is use to implement ->merge_bvec_fn */ + int (*mergeable_bvec)(struct mddev *mddev, + struct bvec_merge_data *bvm, + struct bio_vec *biovec); }; struct md_sysfs_entry { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 4b521ea..3770c96 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -350,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, /** * raid0_mergeable_bvec -- tell bio layer if two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, +static int raid0_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r0conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); sector_t sector_offset = sector; @@ -465,7 +464,6 @@ static int raid0_run(struct mddev *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); ret = md_integrity_register(mddev); @@ -724,6 +722,7 @@ static struct md_personality raid0_personality= .takeover = raid0_takeover, .quiesce = raid0_quiesce, .congested = raid0_congested, + .mergeable_bvec = raid0_mergeable_bvec, }; static int __init raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9ad7ce7..45c512a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -701,11 +701,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } -static int raid1_mergeable_bvec(struct request_queue *q, +static int raid1_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r1conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max = biovec->bv_len; @@ -2946,8 +2945,6 @@ static int run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); if (mddev->queue) { - blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); - if (discard_supported) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -3183,6 +3180,7 @@ static struct md_personality raid1_personality = .quiesce = raid1_quiesce, .takeover = raid1_takeover, .congested = raid1_congested, + .mergeable_bvec = raid1_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fb6b886..407c81a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * @@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * This requires checking for end-of-chunk if near_copies != raid_disks, * and for subordinate merge_bvec_fns if merge_check_needed. */ -static int raid10_mergeable_bvec(struct request_queue *q, +static int raid10_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; @@ -3756,7 +3755,6 @@ static int run(struct mddev *mddev) stripe /= conf->geo.near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); } if (md_integrity_register(mddev)) @@ -4717,6 +4715,7 @@ static struct md_personality raid10_personality = .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, .congested = raid10_congested, + .mergeable_bvec = raid10_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 502a908..2d4a2cc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4170,11 +4170,10 @@ static int raid5_congested(struct mddev *mddev, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, +static int raid5_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; @@ -6237,8 +6236,6 @@ static int run(struct mddev *mddev) if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * @@ -7113,6 +7110,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7136,6 +7134,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7160,6 +7159,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void) -- cgit v0.10.2 From 3be260cc18f850873cd32381158e28b0a9a391fd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:57 +1100 Subject: md/linear: remove rcu protections in favour of suspend/resume The use of 'rcu' to protect accesses to ->private_data so that the ->private_data could be updated predates the introduction of mddev_suspend/mddev_resume. These are a cleaner mechanism for providing stability while swapping in a new ->private data - it is used by level_store() to support changing of raid levels. So get rid of the RCU stuff and just use mddev_suspend, mddev_resume. As these function call ->quiesce(), we add an empty function for linear just like for raid0. Signed-off-by: NeilBrown diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 4c2a92c..b3e717a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) lo = 0; hi = mddev->raid_disks - 1; - conf = rcu_dereference(mddev->private); + conf = mddev->private; /* * Binary Search @@ -70,7 +70,6 @@ static int linear_mergeable_bvec(struct mddev *mddev, int maxbytes = biovec->bv_len; struct request_queue *subq; - rcu_read_lock(); dev0 = which_dev(mddev, sector); maxsectors = dev0->end_sector - sector; subq = bdev_get_queue(dev0->rdev->bdev); @@ -80,7 +79,6 @@ static int linear_mergeable_bvec(struct mddev *mddev, maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, biovec)); } - rcu_read_unlock(); if (maxsectors < bio_sectors) maxsectors = 0; @@ -101,15 +99,13 @@ static int linear_congested(struct mddev *mddev, int bits) struct linear_conf *conf; int i, ret = 0; - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } - rcu_read_unlock(); return ret; } @@ -118,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk struct linear_conf *conf; sector_t array_sectors; - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); array_sectors = conf->array_sectors; - rcu_read_unlock(); return array_sectors; } @@ -243,33 +237,22 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + mddev_suspend(mddev); + oldconf = mddev->private; mddev->raid_disks++; - rcu_assign_pointer(mddev->private, newconf); + mddev->private = newconf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree_rcu(oldconf, rcu); + kfree(oldconf); return 0; } static int linear_stop (struct mddev *mddev) { - struct linear_conf *conf = - rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + struct linear_conf *conf = mddev->private; - /* - * We do not require rcu protection here since - * we hold reconfig_mutex for both linear_add and - * linear_stop, so they cannot race. - * We should make sure any old 'conf's are properly - * freed though. - */ - rcu_barrier(); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); mddev->private = NULL; @@ -290,16 +273,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) } do { - rcu_read_lock(); - tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; end_sector = tmp_dev->end_sector; data_offset = tmp_dev->rdev->data_offset; bio->bi_bdev = tmp_dev->rdev->bdev; - rcu_read_unlock(); - if (unlikely(bio->bi_iter.bi_sector >= end_sector || bio->bi_iter.bi_sector < start_sector)) goto out_of_bounds; @@ -346,6 +325,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + static struct md_personality linear_personality = { .name = "linear", @@ -357,6 +340,7 @@ static struct md_personality linear_personality = .status = linear_status, .hot_add_disk = linear_add, .size = linear_size, + .quiesce = linear_quiesce, .congested = linear_congested, .mergeable_bvec = linear_mergeable_bvec, }; -- cgit v0.10.2 From 5aa61f427e4979be733e4847b9199ff9cc48a47e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:57 +1100 Subject: md: split detach operation out from ->stop. Each md personality has a 'stop' operation which does two things: 1/ it finalizes some aspects of the array to ensure nothing is accessing the ->private data 2/ it frees the ->private data. All the steps in '1' can apply to all arrays and so can be performed in common code. This is useful as in the case where we change the personality which manages an array (in level_store()), it would be helpful to do step 1 early, and step 2 later. So split the 'step 1' functionality out into a new mddev_detach(). Signed-off-by: NeilBrown diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b3e717a..c201555 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -253,7 +253,6 @@ static int linear_stop (struct mddev *mddev) { struct linear_conf *conf = mddev->private; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); mddev->private = NULL; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9f0ff71..58f140b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); +static void mddev_detach(struct mddev *mddev); /* * Default number of read corrections we'll attempt on an rdev @@ -3372,6 +3373,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) /* Looks like we have a winner */ mddev_suspend(mddev); + mddev_detach(mddev); mddev->pers->stop(mddev); if (mddev->pers->sync_request == NULL && @@ -4928,18 +4930,17 @@ int md_run(struct mddev *mddev) (unsigned long long)mddev->array_sectors / 2, (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); err = -EINVAL; - mddev->pers->stop(mddev); } if (err == 0 && mddev->pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); - if (err) { + if (err) printk(KERN_ERR "%s: failed to create bitmap (%d)\n", mdname(mddev), err); - mddev->pers->stop(mddev); - } } if (err) { + mddev_detach(mddev); + mddev->pers->stop(mddev); module_put(mddev->pers->owner); mddev->pers = NULL; bitmap_destroy(mddev); @@ -5112,9 +5113,30 @@ void md_stop_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop_writes); +static void mddev_detach(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + if (mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + md_unregister_thread(&mddev->thread); + if (mddev->queue) + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +} + static void __md_stop(struct mddev *mddev) { mddev->ready = 0; + mddev_detach(mddev); mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index fedb1b3..9fe3445 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -504,8 +504,6 @@ static int multipath_stop (struct mddev *mddev) { struct mpconf *conf = mddev->private; - md_unregister_thread(&mddev->thread); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ mempool_destroy(conf->pool); kfree(conf->multipaths); kfree(conf); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 3770c96..01dfca9 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -477,7 +477,6 @@ static int raid0_stop(struct mddev *mddev) { struct r0conf *conf = mddev->private; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 45c512a..fccea0b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2954,29 +2954,17 @@ static int run(struct mddev *mddev) } ret = md_integrity_register(mddev); - if (ret) + if (ret) { + md_unregister_thread(&mddev->thread); stop(mddev); + } return ret; } static int stop(struct mddev *mddev) { struct r1conf *conf = mddev->private; - struct bitmap *bitmap = mddev->bitmap; - - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - - freeze_array(conf, 0); - unfreeze_array(conf); - md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 407c81a..654fdae 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3802,14 +3802,6 @@ static int stop(struct mddev *mddev) { struct r10conf *conf = mddev->private; - raise_barrier(conf, 0); - lower_barrier(conf); - - md_unregister_thread(&mddev->thread); - if (mddev->queue) - /* the unplug fn references 'conf'*/ - blk_sync_queue(mddev->queue); - if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2d4a2cc..4825260 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6317,7 +6317,6 @@ static int stop(struct mddev *mddev) { struct r5conf *conf = mddev->private; - md_unregister_thread(&mddev->thread); free_conf(conf); mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; -- cgit v0.10.2 From afa0f557cb15176570a18fb2a093e348a793afd4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:58 +1100 Subject: md: rename ->stop to ->free Now that the ->stop function only frees the private data, rename is accordingly. Also pass in the private pointer as an arg rather than using mddev->private. This flexibility will be useful in level_store(). Finally, don't clear ->private. It doesn't make sense to clear it seeing that isn't what we free, and it is no longer necessary to clear ->private (it was some time ago before ->to_remove was introduced). Setting ->to_remove in ->free() is a bit of a wart, but not a big problem at the moment. Signed-off-by: NeilBrown diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index e8b4574..1277eb2 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -332,13 +332,11 @@ static int run(struct mddev *mddev) return 0; } -static int stop(struct mddev *mddev) +static void faulty_free(struct mddev *mddev, void *priv) { - struct faulty_conf *conf = mddev->private; + struct faulty_conf *conf = priv; kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality faulty_personality = @@ -348,7 +346,7 @@ static struct md_personality faulty_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = faulty_free, .status = status, .check_reshape = reshape, .size = faulty_size, diff --git a/drivers/md/linear.c b/drivers/md/linear.c index c201555..fa7d577 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -249,14 +249,11 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) return 0; } -static int linear_stop (struct mddev *mddev) +static void linear_free(struct mddev *mddev, void *priv) { - struct linear_conf *conf = mddev->private; + struct linear_conf *conf = priv; kfree(conf); - mddev->private = NULL; - - return 0; } static void linear_make_request(struct mddev *mddev, struct bio *bio) @@ -335,7 +332,7 @@ static struct md_personality linear_personality = .owner = THIS_MODULE, .make_request = linear_make_request, .run = linear_run, - .stop = linear_stop, + .free = linear_free, .status = linear_status, .hot_add_disk = linear_add, .size = linear_size, diff --git a/drivers/md/md.c b/drivers/md/md.c index 58f140b..2920fd0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -293,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio) /* mddev_suspend makes sure no new requests are submitted * to the device, and that any requests that have been submitted * are completely handled. - * Once ->stop is called and completes, the module will be completely - * unused. + * Once mddev_detach() is called and completes, the module will be + * completely unused. */ void mddev_suspend(struct mddev *mddev) { @@ -3374,7 +3374,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) /* Looks like we have a winner */ mddev_suspend(mddev); mddev_detach(mddev); - mddev->pers->stop(mddev); + mddev->pers->free(mddev, mddev->private); if (mddev->pers->sync_request == NULL && pers->sync_request != NULL) { @@ -4940,7 +4940,7 @@ int md_run(struct mddev *mddev) } if (err) { mddev_detach(mddev); - mddev->pers->stop(mddev); + mddev->pers->free(mddev, mddev->private); module_put(mddev->pers->owner); mddev->pers = NULL; bitmap_destroy(mddev); @@ -5137,7 +5137,7 @@ static void __md_stop(struct mddev *mddev) { mddev->ready = 0; mddev_detach(mddev); - mddev->pers->stop(mddev); + mddev->pers->free(mddev, mddev->private); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; module_put(mddev->pers->owner); diff --git a/drivers/md/md.h b/drivers/md/md.h index bee5b85..37e7c17 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -465,7 +465,7 @@ struct md_personality struct module *owner; void (*make_request)(struct mddev *mddev, struct bio *bio); int (*run)(struct mddev *mddev); - int (*stop)(struct mddev *mddev); + void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync * if appropriate, and should abort recovery if needed diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 9fe3445..ac3ede2 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -399,7 +399,7 @@ static int multipath_run (struct mddev *mddev) /* * copy the already verified devices into our private MULTIPATH * bookkeeping area. [whatever we allocate in multipath_run(), - * should be freed in multipath_stop()] + * should be freed in multipath_free()] */ conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); @@ -500,15 +500,13 @@ out: return -EIO; } -static int multipath_stop (struct mddev *mddev) +static void multipath_free(struct mddev *mddev, void *priv) { - struct mpconf *conf = mddev->private; + struct mpconf *conf = priv; mempool_destroy(conf->pool); kfree(conf->multipaths); kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality multipath_personality = @@ -518,7 +516,7 @@ static struct md_personality multipath_personality = .owner = THIS_MODULE, .make_request = multipath_make_request, .run = multipath_run, - .stop = multipath_stop, + .free = multipath_free, .status = multipath_status, .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 01dfca9..a13f738 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -415,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static int raid0_stop(struct mddev *mddev); +static void raid0_free(struct mddev *mddev, void *priv); static int raid0_run(struct mddev *mddev) { @@ -468,20 +468,18 @@ static int raid0_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) - raid0_stop(mddev); + raid0_free(mddev, conf); return ret; } -static int raid0_stop(struct mddev *mddev) +static void raid0_free(struct mddev *mddev, void *priv) { - struct r0conf *conf = mddev->private; + struct r0conf *conf = priv; kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - mddev->private = NULL; - return 0; } /* @@ -715,7 +713,7 @@ static struct md_personality raid0_personality= .owner = THIS_MODULE, .make_request = raid0_make_request, .run = raid0_run, - .stop = raid0_stop, + .free = raid0_free, .status = raid0_status, .size = raid0_size, .takeover = raid0_takeover, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fccea0b..5dd0c2e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2872,7 +2872,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static int stop(struct mddev *mddev); +static void raid1_free(struct mddev *mddev, void *priv); static int run(struct mddev *mddev) { struct r1conf *conf; @@ -2894,7 +2894,7 @@ static int run(struct mddev *mddev) /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] + * should be freed in raid1_free()] */ if (mddev->private == NULL) conf = setup_conf(mddev); @@ -2956,14 +2956,14 @@ static int run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) { md_unregister_thread(&mddev->thread); - stop(mddev); + raid1_free(mddev, conf); } return ret; } -static int stop(struct mddev *mddev) +static void raid1_free(struct mddev *mddev, void *priv) { - struct r1conf *conf = mddev->private; + struct r1conf *conf = priv; if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); @@ -2971,8 +2971,6 @@ static int stop(struct mddev *mddev) safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); - mddev->private = NULL; - return 0; } static int raid1_resize(struct mddev *mddev, sector_t sectors) @@ -3155,7 +3153,7 @@ static struct md_personality raid1_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid1_free, .status = status, .error_handler = error, .hot_add_disk = raid1_add_disk, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 654fdae..d1203cd 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3798,9 +3798,9 @@ out: return -EIO; } -static int stop(struct mddev *mddev) +static void raid10_free(struct mddev *mddev, void *priv) { - struct r10conf *conf = mddev->private; + struct r10conf *conf = priv; if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); @@ -3809,8 +3809,6 @@ static int stop(struct mddev *mddev) kfree(conf->mirrors_old); kfree(conf->mirrors_new); kfree(conf); - mddev->private = NULL; - return 0; } static void raid10_quiesce(struct mddev *mddev, int state) @@ -4692,7 +4690,7 @@ static struct md_personality raid10_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid10_free, .status = status, .error_handler = error, .hot_add_disk = raid10_add_disk, diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4825260..dab908b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6313,14 +6313,12 @@ abort: return -EIO; } -static int stop(struct mddev *mddev) +static void raid5_free(struct mddev *mddev, void *priv) { - struct r5conf *conf = mddev->private; + struct r5conf *conf = priv; free_conf(conf); - mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; - return 0; } static void status(struct seq_file *seq, struct mddev *mddev) @@ -7094,7 +7092,7 @@ static struct md_personality raid6_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7118,7 +7116,7 @@ static struct md_personality raid5_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7143,7 +7141,7 @@ static struct md_personality raid4_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, -- cgit v0.10.2 From db721d32b74b51a5ac9ec9fab1d85cba90dbdbd3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:58 +1100 Subject: md: level_store: group all important changes into one place. Gather all the changes that can happen atomically and might be relevant to other code into one place. This will make it easier to refine the locking. Note that this puts quite a few things between mddev_detach() and ->free(). Enabling this was the point of some recent patches. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 2920fd0..58f531f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3280,9 +3280,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; ssize_t rv = len; - struct md_personality *pers; + struct md_personality *pers, *oldpers; long level; - void *priv; + void *priv, *oldpriv; struct md_rdev *rdev; if (mddev->pers == NULL) { @@ -3374,9 +3374,35 @@ level_store(struct mddev *mddev, const char *buf, size_t len) /* Looks like we have a winner */ mddev_suspend(mddev); mddev_detach(mddev); - mddev->pers->free(mddev, mddev->private); + oldpers = mddev->pers; + oldpriv = mddev->private; + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + + if (oldpers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. + */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } - if (mddev->pers->sync_request == NULL && + oldpers->free(mddev, oldpriv); + + if (oldpers->sync_request == NULL && pers->sync_request != NULL) { /* need to add the md_redundancy_group */ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) @@ -3385,27 +3411,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); } - if (mddev->pers->sync_request != NULL && + if (oldpers->sync_request != NULL && pers->sync_request == NULL) { /* need to remove the md_redundancy_group */ if (mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; } - if (mddev->pers->sync_request == NULL && - mddev->external) { - /* We are converting from a no-redundancy array - * to a redundancy array and metadata is managed - * externally so we need to be sure that writes - * won't block due to a need to transition - * clean->dirty - * until external management is started. - */ - mddev->in_sync = 0; - mddev->safemode_delay = 0; - mddev->safemode = 0; - } - rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; @@ -3431,17 +3443,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } } - module_put(mddev->pers->owner); - mddev->pers = pers; - mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - mddev->level = mddev->new_level; - mddev->layout = mddev->new_layout; - mddev->chunk_sectors = mddev->new_chunk_sectors; - mddev->delta_disks = 0; - mddev->reshape_backwards = 0; - mddev->degraded = 0; - if (mddev->pers->sync_request == NULL) { + if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ -- cgit v0.10.2 From 36d091f4759d194c99f0705d412afe208622b45a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:58 +1100 Subject: md: protect ->pers changes with mddev->lock ->pers is already protected by ->reconfig_mutex, and cannot possibly change when there are threads running or outstanding IO. However there are some places where we access ->pers not in a thread or IO context, and where ->reconfig_mutex is unnecessarily heavy-weight: level_show and md_seq_show(). So protect all changes, and those accesses, with ->lock. This is a step toward taking those accesses out from under reconfig_mutex. [Fixed missing "mddev->pers" -> "pers" conversion, thanks to Dan Carpenter ] Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 58f531f..4db4e41 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3264,15 +3264,20 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { - struct md_personality *p = mddev->pers; + struct md_personality *p; + int ret; + spin_lock(&mddev->lock); + p = mddev->pers; if (p) - return sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) - return sprintf(page, "%s\n", mddev->clevel); + ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) - return sprintf(page, "%d\n", mddev->level); + ret = sprintf(page, "%d\n", mddev->level); else - return 0; + ret = 0; + spin_unlock(&mddev->lock); + return ret; } static ssize_t @@ -3374,6 +3379,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) /* Looks like we have a winner */ mddev_suspend(mddev); mddev_detach(mddev); + + spin_lock(&mddev->lock); oldpers = mddev->pers; oldpriv = mddev->private; mddev->pers = pers; @@ -3385,6 +3392,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->degraded = 0; + spin_unlock(&mddev->lock); if (oldpers->sync_request == NULL && mddev->external) { @@ -4866,7 +4874,6 @@ int md_run(struct mddev *mddev) mddev->clevel); return -EINVAL; } - mddev->pers = pers; spin_unlock(&pers_lock); if (mddev->level != pers->level) { mddev->level = pers->level; @@ -4877,7 +4884,6 @@ int md_run(struct mddev *mddev) if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - mddev->pers = NULL; module_put(pers->owner); return -EINVAL; } @@ -4921,19 +4927,19 @@ int md_run(struct mddev *mddev) if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ - err = mddev->pers->run(mddev); + err = pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { WARN_ONCE(!mddev->external_size, "%s: default size too small," " but 'external_size' not in effect?\n", __func__); printk(KERN_ERR "md: invalid array_size %llu > default size %llu\n", (unsigned long long)mddev->array_sectors / 2, - (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; } - if (err == 0 && mddev->pers->sync_request && + if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); if (err) @@ -4942,9 +4948,8 @@ int md_run(struct mddev *mddev) } if (err) { mddev_detach(mddev); - mddev->pers->free(mddev, mddev->private); - module_put(mddev->pers->owner); - mddev->pers = NULL; + pers->free(mddev, mddev->private); + module_put(pers->owner); bitmap_destroy(mddev); return err; } @@ -4953,7 +4958,7 @@ int md_run(struct mddev *mddev) mddev->queue->backing_dev_info.congested_fn = md_congested; blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec); } - if (mddev->pers->sync_request) { + if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING @@ -4972,7 +4977,10 @@ int md_run(struct mddev *mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); + spin_lock(&mddev->lock); + mddev->pers = pers; mddev->ready = 1; + spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) @@ -5126,7 +5134,7 @@ static void mddev_detach(struct mddev *mddev) wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); } - if (mddev->pers->quiesce) { + if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } @@ -5137,13 +5145,16 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { - mddev->ready = 0; + struct md_personality *pers = mddev->pers; mddev_detach(mddev); - mddev->pers->free(mddev, mddev->private); - if (mddev->pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(mddev->pers->owner); + spin_lock(&mddev->lock); + mddev->ready = 0; mddev->pers = NULL; + spin_unlock(&mddev->lock); + pers->free(mddev, mddev->private); + if (pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } @@ -6942,6 +6953,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev_lock(mddev) < 0) return -EINTR; + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? "" : "in"); @@ -7012,6 +7024,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } + spin_unlock(&mddev->lock); mddev_unlock(mddev); return 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 37e7c17..e41559d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -391,6 +391,7 @@ struct mddev { * rdev superblocks, events * clearing MD_CHANGE_* * in_sync - and related safemode and MD_CHANGE changes + * pers (also protected by reconfig_mutex and pending IO). */ spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ -- cgit v0.10.2 From 978a7a47cae79ae7a7b5a1e80bfcaef6ee700312 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:58 +1100 Subject: md/bitmap: protect clearing of ->bitmap by mddev->lock This makes it safe to inspect the struct while holding only the spinlock. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1695ee5..3424b19 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1619,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev) return; mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); mddev->bitmap = NULL; /* disconnect from the md device */ + spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; diff --git a/drivers/md/md.h b/drivers/md/md.h index e41559d..8770308 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -392,6 +392,7 @@ struct mddev { * clearing MD_CHANGE_* * in_sync - and related safemode and MD_CHANGE changes * pers (also protected by reconfig_mutex and pending IO). + * clearing ->bitmap */ spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ -- cgit v0.10.2 From f97fcad38f2ecf2e34b6f0ab93f74f2978dbe008 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:59 +1100 Subject: md: remove need for mddev_lock() in md_seq_show() The only access in md_seq_show that could suffer from races not protected by ->lock is walking the rdev list. This can receive sufficient protection from 'rcu'. So use rdev_for_each_rcu() and get rid of mddev_lock(). Now reading /proc/mdstat will never block in md_seq_show. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 4db4e41..eb0c3b5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6950,9 +6950,6 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } - if (mddev_lock(mddev) < 0) - return -EINTR; - spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), @@ -6966,7 +6963,8 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - rdev_for_each(rdev, mddev) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -6982,6 +6980,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "(R)"); sectors += rdev->sectors; } + rcu_read_unlock(); if (!list_empty(&mddev->disks)) { if (mddev->pers) @@ -7025,7 +7024,6 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); - mddev_unlock(mddev); return 0; } -- cgit v0.10.2 From 7b1485bab9c49b0d3811d72beb0de60c7b8b337d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:59 +1100 Subject: md/raid5: use ->lock to protect accessing raid5 sysfs attributes. It is important that mddev->private isn't freed while a sysfs attribute function is accessing it. So use mddev->lock to protect the setting of ->private to NULL, and take that lock when checking ->private for NULL and de-referencing it in the sysfs access functions. This only applies to the read ('show') side of access. Write access will be handled separately. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dab908b..d5b8017 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5354,11 +5354,14 @@ static void raid5d(struct md_thread *thread) static ssize_t raid5_show_stripe_cache_size(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->max_nr_stripes); - else - return 0; + ret = sprintf(page, "%d\n", conf->max_nr_stripes); + spin_unlock(&mddev->lock); + return ret; } int @@ -5422,11 +5425,14 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->bypass_threshold); - else - return 0; + ret = sprintf(page, "%d\n", conf->bypass_threshold); + spin_unlock(&mddev->lock); + return ret; } static ssize_t @@ -5456,11 +5462,14 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, static ssize_t raid5_show_skip_copy(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->skip_copy); - else - return 0; + ret = sprintf(page, "%d\n", conf->skip_copy); + spin_unlock(&mddev->lock); + return ret; } static ssize_t @@ -5512,11 +5521,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static ssize_t raid5_show_group_thread_cnt(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->worker_cnt_per_group); - else - return 0; + ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); + spin_unlock(&mddev->lock); + return ret; } static int alloc_thread_groups(struct r5conf *conf, int cnt, -- cgit v0.10.2 From b7b17c9b67e5984210c83d50d2c8117d3bd50ea0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:59 +1100 Subject: md: remove mddev_lock() from md_attr_show() Most attributes can be read safely without any locking. A race might lead to a slightly out-dated value, but nothing wrong. We already have locking in some places where needed. All that remains is can_clear_show(), behind_writes_used_show() and action_show() which are easily fixed. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 3424b19..3a57679 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -2211,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; + spin_lock(&mddev->lock); if (mddev->bitmap) len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? "false" : "true")); else len = sprintf(page, "\n"); + spin_unlock(&mddev->lock); return len; } @@ -2240,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { + ssize_t ret; + spin_lock(&mddev->lock); if (mddev->bitmap == NULL) - return sprintf(page, "0\n"); - return sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); + ret = sprintf(page, "0\n"); + else + ret = sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); + spin_unlock(&mddev->lock); + return ret; } static ssize_t diff --git a/drivers/md/md.c b/drivers/md/md.c index eb0c3b5..dbbe5e6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4042,20 +4042,21 @@ static ssize_t action_show(struct mddev *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + unsigned long recovery = mddev->recovery; + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) type = "check"; else type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; } return sprintf(page, "%s\n", type); @@ -4572,11 +4573,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) mddev_get(mddev); spin_unlock(&all_mddevs_lock); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->show(mddev, page); - mddev_unlock(mddev); - } + rv = entry->show(mddev, page); mddev_put(mddev); return rv; } -- cgit v0.10.2 From 758bfc8abfbc26c196a53c52d52d251f20226a5c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:56:59 +1100 Subject: md: remove mddev_lock from rdev_attr_show() No rdev attributes need locking for 'show', though state_show() might benefit from ensuring it sees a consistent set of flags. None even use rdev->mddev, so testing for it isn't really needed and it certainly doesn't need to be held constant. So improve state_show() and remove the locking. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index dbbe5e6..5438834 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2419,40 +2419,41 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ""; size_t len = 0; + unsigned long flags = ACCESS_ONCE(rdev->flags); - if (test_bit(Faulty, &rdev->flags) || + if (test_bit(Faulty, &flags) || rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } - if (test_bit(In_sync, &rdev->flags)) { + if (test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sin_sync",sep); sep = ","; } - if (test_bit(WriteMostly, &rdev->flags)) { + if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags) || + if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist - && !test_bit(Faulty, &rdev->flags))) { + && !test_bit(Faulty, &flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } - if (!test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) { + if (!test_bit(Faulty, &flags) && + !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (test_bit(WriteErrorSeen, &flags)) { len += sprintf(page+len, "%swrite_error", sep); sep = ","; } - if (test_bit(WantReplacement, &rdev->flags)) { + if (test_bit(WantReplacement, &flags)) { len += sprintf(page+len, "%swant_replacement", sep); sep = ","; } - if (test_bit(Replacement, &rdev->flags)) { + if (test_bit(Replacement, &flags)) { len += sprintf(page+len, "%sreplacement", sep); sep = ","; } @@ -2965,21 +2966,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); - struct mddev *mddev = rdev->mddev; - ssize_t rv; if (!entry->show) return -EIO; - - rv = mddev ? mddev_lock(mddev) : -EBUSY; - if (!rv) { - if (rdev->mddev == NULL) - rv = -EBUSY; - else - rv = entry->show(rdev, page); - mddev_unlock(mddev); - } - return rv; + if (!rdev->mddev) + return -EBUSY; + return entry->show(rdev, page); } static ssize_t -- cgit v0.10.2 From f4ad3d38d49dc0c4c911e31d8b884d5b74362b6e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:57:00 +1100 Subject: md: remove unnecessary 'buf' from get_bitmap_file. 'buf' is only used because d_path fills from the end of the buffer instead of from the start. We don't need a separate buf to handle that, we just need to use memmove() to move the string to the start. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 5438834..bddecc0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5502,7 +5502,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ - char *ptr, *buf = NULL; + char *ptr; int err = -ENOMEM; file = kmalloc(sizeof(*file), GFP_NOIO); @@ -5516,23 +5516,19 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg) goto copy_out; } - buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); - if (!buf) - goto out; - ptr = d_path(&mddev->bitmap->storage.file->f_path, - buf, sizeof(file->pathname)); + file->pathname, sizeof(file->pathname)); if (IS_ERR(ptr)) goto out; - strcpy(file->pathname, ptr); + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); copy_out: err = 0; if (copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; out: - kfree(buf); kfree(file); return err; } -- cgit v0.10.2 From 1e594bb24d3d08fe4a8afa0fc45a61d6109130fa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:57:00 +1100 Subject: md: tidy up set_bitmap_file 1/ delay setting mddev->bitmap_info.file until 'f' looks usable, so we don't have to unset it. 2/ Don't allow bitmap file to be set if bitmap_info.file is already set. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index bddecc0..72c44d3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5851,22 +5851,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { struct inode *inode; - if (mddev->bitmap) + struct file *f; + + if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_info.file = fget(fd); + f = fget(fd); - if (mddev->bitmap_info.file == NULL) { + if (f == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - inode = mddev->bitmap_info.file->f_mapping->host; + inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) { printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", mdname(mddev)); err = -EBADF; - } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + } else if (!(f->f_mode & FMODE_WRITE)) { printk(KERN_ERR "%s: error: bitmap file must open for write\n", mdname(mddev)); err = -EBADF; @@ -5876,10 +5878,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = -EBUSY; } if (err) { - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + fput(f); return err; } + mddev->bitmap_info.file = f; mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ -- cgit v0.10.2 From 4af1a04176bdb4688aa14f6c10d1d5131c036a9d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:57:00 +1100 Subject: md: move GET_BITMAP_FILE ioctl out from mddev_lock. It makes more sense to report bitmap_info->file, rather than bitmap->file (the later is only available once the array is active). With that change, use mddev->lock to protect bitmap_info being set to NULL, and we can call get_bitmap_file() without taking the mutex. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 72c44d3..5a94916 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5292,8 +5292,11 @@ static int do_md_stop(struct mddev *mddev, int mode, bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - fput(mddev->bitmap_info.file); + struct file *f = mddev->bitmap_info.file; + spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); } mddev->bitmap_info.offset = 0; @@ -5503,32 +5506,30 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr; - int err = -ENOMEM; + int err; file = kmalloc(sizeof(*file), GFP_NOIO); - if (!file) - goto out; + return -ENOMEM; + err = 0; + spin_lock(&mddev->lock); /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->storage.file) { + if (!mddev->bitmap_info.file) file->pathname[0] = '\0'; - goto copy_out; - } - - ptr = d_path(&mddev->bitmap->storage.file->f_path, - file->pathname, sizeof(file->pathname)); - if (IS_ERR(ptr)) - goto out; - - memmove(file->pathname, ptr, - sizeof(file->pathname)-(ptr-file->pathname)); + else if ((ptr = d_path(&mddev->bitmap_info.file->f_path, + file->pathname, sizeof(file->pathname))), + IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); + spin_unlock(&mddev->lock); -copy_out: - err = 0; - if (copy_to_user(arg, file, sizeof(*file))) + if (err == 0 && + copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; -out: + kfree(file); return err; } @@ -5900,9 +5901,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_info.file) - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + struct file *f = mddev->bitmap_info.file; + if (f) { + spin_lock(&mddev->lock); + mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); + } } return err; @@ -6315,6 +6320,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, case SET_DISK_FAULTY: err = set_disk_faulty(mddev, new_decode_dev(arg)); goto out; + + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto out; + } if (cmd == ADD_NEW_DISK) @@ -6406,10 +6416,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * Commands even a read-only array can execute: */ switch (cmd) { - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto unlock; diff --git a/drivers/md/md.h b/drivers/md/md.h index 8770308..b4fbd6a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -393,6 +393,7 @@ struct mddev { * in_sync - and related safemode and MD_CHANGE changes * pers (also protected by reconfig_mutex and pending IO). * clearing ->bitmap + * clearing ->bitmap_info.file */ spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ -- cgit v0.10.2 From 1b30e66f5acc6bf22fff49d4093cf17454f914b7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:57:00 +1100 Subject: md: minor cleanup in safe_delay_store. There isn't really much room for races with ->safemode_delay. But as I am trying to clean up any racy code and will soon be removing reconfig_mutex protection from most _store() functions: - only set mddev->safemode_delay once, to ensure no code can see an intermediate value - use safemode_timer to call md_safemode_timeout() rather than calling it directly, to ensure it never races with itself. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 5a94916..0f9bc9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3242,11 +3242,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; - mddev->safemode_delay = (msec*HZ)/1000; - if (mddev->safemode_delay == 0) - mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay || old_delay == 0) - md_safemode_timeout((unsigned long)mddev); + unsigned long new_delay = (msec*HZ)/1000; + + if (new_delay == 0) + new_delay = 1; + mddev->safemode_delay = new_delay; + if (new_delay < old_delay || old_delay == 0) + mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } -- cgit v0.10.2 From 23da422b1951cb8dbcb7c3090057cb6d5ceedf49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:57:01 +1100 Subject: md: use mddev->lock to protect updates to resync_{min,max}. There are interdependencies between these two sysfs attributes and whether a resync is currently running. Rather than depending on reconfig_mutex to ensure no races when testing these interdependencies are met, use the spinlock. This will allow the mutex to be remove from protecting this code in a subsequent patch. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 0f9bc9a..0f00c1e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4269,22 +4269,36 @@ static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; + int err; + int chunk; + if (kstrtoull(buf, 10, &min)) return -EINVAL; + + spin_lock(&mddev->lock); + err = -EINVAL; if (min > mddev->resync_max) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = min; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_min = min; + err = 0; - return len; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_min_sync = @@ -4302,29 +4316,42 @@ max_sync_show(struct mddev *mddev, char *page) static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { + int err; + spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; + int chunk; + + err = -EINVAL; if (kstrtoull(buf, 10, &max)) - return -EINVAL; + goto out_unlock; if (max < mddev->resync_min) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (max < mddev->resync_max && mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = max; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); - return len; + err = 0; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_max_sync = @@ -7585,6 +7612,7 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) @@ -7593,6 +7621,8 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; + spin_unlock(&mddev->lock); + wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -7793,7 +7823,9 @@ void md_check_recovery(struct mddev *mddev) * any transients in the value of "sync_action". */ mddev->curr_resync_completed = 0; + spin_lock(&mddev->lock); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set */ diff --git a/drivers/md/md.h b/drivers/md/md.h index b4fbd6a..6bf3faa 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -394,6 +394,8 @@ struct mddev { * pers (also protected by reconfig_mutex and pending IO). * clearing ->bitmap * clearing ->bitmap_info.file + * changing ->resync_{min,max} + * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) */ spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ -- cgit v0.10.2 From 5c47daf6e76f657d961a96d89f6419fde8eda557 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:57:01 +1100 Subject: md: move mddev_lock and related to md.h The one which is not inline (mddev_unlock) gets EXPORTed. This makes the locking available to personality modules so that it doesn't have to be imposed upon them. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 0f00c1e..ea839d8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -590,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit) goto retry; } -static inline int __must_check mddev_lock(struct mddev *mddev) -{ - return mutex_lock_interruptible(&mddev->reconfig_mutex); -} - -/* Sometimes we need to take the lock in a situation where - * failure due to interrupts is not acceptable. - */ -static inline void mddev_lock_nointr(struct mddev *mddev) -{ - mutex_lock(&mddev->reconfig_mutex); -} - -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - -static inline int mddev_trylock(struct mddev *mddev) -{ - return mutex_trylock(&mddev->reconfig_mutex); -} - static struct attribute_group md_redundancy_group; -static void mddev_unlock(struct mddev *mddev) +void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as @@ -657,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev) md_wakeup_thread(mddev->thread); spin_unlock(&pers_lock); } +EXPORT_SYMBOL_GPL(mddev_unlock); static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 6bf3faa..14367d9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -450,6 +450,30 @@ struct mddev { void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); }; +static inline int __must_check mddev_lock(struct mddev *mddev) +{ + return mutex_lock_interruptible(&mddev->reconfig_mutex); +} + +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev *mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + +static inline int mddev_trylock(struct mddev *mddev) +{ + return mutex_trylock(&mddev->reconfig_mutex); +} +extern void mddev_unlock(struct mddev *mddev); + static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { int faulty = test_bit(Faulty, &rdev->flags); -- cgit v0.10.2 From 6791875e2e5393845b9c781d2998481089735134 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Dec 2014 12:57:01 +1100 Subject: md: make reconfig_mutex optional for writes to md sysfs files. Rather than using mddev_lock() to take the reconfig_mutex when writing to any md sysfs file, we only take mddev_lock() in the particular _store() functions that require it. Admittedly this is most, but it isn't all. This also allows us to remove special-case handling for new_dev_store (in md_attr_store). Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index ea839d8..c8d2bac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3256,26 +3256,32 @@ static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; - ssize_t rv = len; + ssize_t rv; + size_t slen = len; struct md_personality *pers, *oldpers; long level; void *priv, *oldpriv; struct md_rdev *rdev; + if (slen == 0 || slen >= sizeof(clevel)) + return -EINVAL; + + rv = mddev_lock(mddev); + if (rv) + return rv; + if (mddev->pers == NULL) { - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') - len--; - mddev->clevel[len] = 0; + strncpy(mddev->clevel, buf, slen); + if (mddev->clevel[slen-1] == '\n') + slen--; + mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; - return rv; + rv = len; + goto out_unlock; } + rv = -EROFS; if (mddev->ro) - return -EROFS; + goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3283,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * - new personality will access other array. */ + rv = -EBUSY; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) - return -EBUSY; + goto out_unlock; + rv = -EINVAL; if (!mddev->pers->quiesce) { printk(KERN_WARNING "md: %s: %s does not support online personality change\n", mdname(mddev), mddev->pers->name); - return -EINVAL; + goto out_unlock; } /* Now find the new personality */ - if (len == 0 || len >= sizeof(clevel)) - return -EINVAL; - strncpy(clevel, buf, len); - if (clevel[len-1] == '\n') - len--; - clevel[len] = 0; + strncpy(clevel, buf, slen); + if (clevel[slen-1] == '\n') + slen--; + clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; @@ -3312,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); printk(KERN_WARNING "md: personality %s not loaded\n", clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ module_put(pers->owner); - return rv; + rv = len; + goto out_unlock; } if (!pers->takeover) { module_put(pers->owner); printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } rdev_for_each(rdev, mddev) @@ -3345,7 +3354,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", mdname(mddev), clevel); - return PTR_ERR(priv); + rv = PTR_ERR(priv); + goto out_unlock; } /* Looks like we have a winner */ @@ -3438,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len) md_update_sb(mddev, 1); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); + rv = len; +out_unlock: + mddev_unlock(mddev); return rv; } @@ -3460,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); + int err; if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_layout = n; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_layout = mddev->layout; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else { + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); @@ -3504,32 +3521,39 @@ static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { char *e; - int rv = 0; + int err; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) - rv = update_raid_disks(mddev, n); + err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) - return -EINVAL; + goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) - return -EINVAL; + goto out_unlock; } + err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; - return rv ? rv : len; +out_unlock: + mddev_unlock(mddev); + return err ? err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); @@ -3548,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page) static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_chunk_sectors = n >> 9; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_chunk_sectors = mddev->chunk_sectors; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else { + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); @@ -3587,20 +3615,27 @@ resync_start_show(struct mddev *mddev, char *page) static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long long n = simple_strtoull(buf, &e, 10); + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - return -EBUSY; - if (cmd_match(buf, "none")) + err = -EBUSY; + else if (cmd_match(buf, "none")) n = MaxSector; else if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = -EINVAL; - mddev->recovery_cp = n; - if (mddev->pers) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - return len; + if (!err) { + mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_resync_start = __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); @@ -3698,8 +3733,39 @@ static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { - int err = -EINVAL; + int err; enum array_state st = match_word(buf, array_states); + + if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { + /* don't take reconfig_mutex when toggling between + * clean and active + */ + spin_lock(&mddev->lock); + if (st == active) { + restart_array(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + wake_up(&mddev->sb_wait); + err = 0; + } else /* st == clean */ { + restart_array(mddev); + if (atomic_read(&mddev->writes_pending) == 0) { + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->safemode == 1) + mddev->safemode = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + err = 0; + } else + err = -EBUSY; + } + spin_unlock(&mddev->lock); + return err; + } + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; switch(st) { case bad_word: break; @@ -3775,14 +3841,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) /* these cannot be set */ break; } - if (err) - return err; - else { + + if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); - return len; } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); @@ -3843,6 +3909,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; + flush_workqueue(md_misc_wq); + + err = mddev_lock(mddev); + if (err) + return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); @@ -3866,6 +3937,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) out: if (err) export_rdev(rdev); + mddev_unlock(mddev); return err ? err : len; } @@ -3877,7 +3949,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; + int err; + err = mddev_lock(mddev); + if (err) + return err; if (!mddev->bitmap) goto out; /* buf should be ... or - ... (range) */ @@ -3895,6 +3971,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) } bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: + mddev_unlock(mddev); return len; } @@ -3922,6 +3999,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err < 0) return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { err = update_size(mddev, sectors); md_update_sb(mddev, 1); @@ -3932,6 +4012,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len) else err = -ENOSPC; } + mddev_unlock(mddev); return err ? err : len; } @@ -3961,21 +4042,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; + int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. */ + + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) - return -EBUSY; + goto out_unlock; + err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; @@ -3989,22 +4077,27 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } major = simple_strtoul(buf, &e, 10); + err = -EINVAL; if (e==buf || *e != '.') - return -EINVAL; + goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) - return -EINVAL; + goto out_unlock; + err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) - return -ENOENT; + goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; - return len; + err = 0; +out_unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_metadata = @@ -4049,7 +4142,10 @@ action_store(struct mddev *mddev, const char *page, size_t len) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + if (mddev_lock(mddev) == 0) { + md_reap_sync_thread(mddev); + mddev_unlock(mddev); + } } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -4063,7 +4159,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; - err = mddev->pers->start_reshape(mddev); + err = mddev_lock(mddev); + if (!err) { + err = mddev->pers->start_reshape(mddev); + mddev_unlock(mddev); + } if (err) return err; sysfs_notify(&mddev->kobj, NULL, "degraded"); @@ -4346,14 +4446,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_lo; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_lo; mddev->suspend_lo = new; if (new >= old) /* Shrinking suspended region */ @@ -4363,7 +4469,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4379,14 +4488,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_hi; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_hi; mddev->suspend_hi = new; if (new <= old) /* Shrinking suspended region */ @@ -4396,7 +4511,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -4416,11 +4534,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; char *e; + int err; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers) - return -EBUSY; + if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; + if (mddev->pers) + goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; @@ -4429,7 +4553,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_position = @@ -4447,6 +4574,8 @@ static ssize_t reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) { int backwards = 0; + int err; + if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) @@ -4456,16 +4585,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->reshape_backwards == backwards) return len; + err = mddev_lock(mddev); + if (err) + return err; /* check if we are allowed to change */ if (mddev->delta_disks) - return -EBUSY; - - if (mddev->persistent && + err = -EBUSY; + else if (mddev->persistent && mddev->major_version == 0) - return -EINVAL; - - mddev->reshape_backwards = backwards; - return len; + err = -EINVAL; + else + mddev->reshape_backwards = backwards; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_direction = @@ -4486,6 +4618,11 @@ static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; + int err; + + err = mddev_lock(mddev); + if (err) + return err; if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) @@ -4496,19 +4633,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, §ors) < 0) - return -EINVAL; - if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -E2BIG; - - mddev->external_size = 1; + err = -EINVAL; + else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + err = -E2BIG; + else + mddev->external_size = 1; } - mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!err) { + mddev->array_sectors = sectors; + if (mddev->pers) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_size = @@ -4596,13 +4736,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } mddev_get(mddev); spin_unlock(&all_mddevs_lock); - if (entry->store == new_dev_store) - flush_workqueue(md_misc_wq); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->store(mddev, page, length); - mddev_unlock(mddev); - } + rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d5b8017..aa76865 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5400,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size); static ssize_t raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - err = raid5_set_cache_size(mddev, new); + err = mddev_lock(mddev); if (err) return err; - return len; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else + err = raid5_set_cache_size(mddev, new); + mddev_unlock(mddev); + + return err ?: len; } static struct md_sysfs_entry @@ -5438,19 +5442,27 @@ raid5_show_preread_threshold(struct mddev *mddev, char *page) static ssize_t raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new > conf->max_nr_stripes) - return -EINVAL; - conf->bypass_threshold = new; - return len; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new > conf->max_nr_stripes) + err = -EINVAL; + else + conf->bypass_threshold = new; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5475,29 +5487,35 @@ raid5_show_skip_copy(struct mddev *mddev, char *page) static ssize_t raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; new = !!new; - if (new == conf->skip_copy) - return len; - mddev_suspend(mddev); - conf->skip_copy = new; - if (new) - mddev->queue->backing_dev_info.capabilities |= - BDI_CAP_STABLE_WRITES; - else - mddev->queue->backing_dev_info.capabilities &= - ~BDI_CAP_STABLE_WRITES; - mddev_resume(mddev); - return len; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->skip_copy) { + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5538,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; struct r5worker_group *new_groups, *old_groups; @@ -5546,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new == conf->worker_cnt_per_group) - return len; - - mddev_suspend(mddev); + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->worker_cnt_per_group) { + mddev_suspend(mddev); - old_groups = conf->worker_groups; - if (old_groups) - flush_workqueue(raid5_wq); + old_groups = conf->worker_groups; + if (old_groups) + flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); - if (!err) { - spin_lock_irq(&conf->device_lock); - conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; - conf->worker_groups = new_groups; - spin_unlock_irq(&conf->device_lock); + err = alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - if (old_groups) - kfree(old_groups[0].workers); - kfree(old_groups); + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + mddev_resume(mddev); } + mddev_unlock(mddev); - mddev_resume(mddev); - - if (err) - return err; - return len; + return err ?: len; } static struct md_sysfs_entry -- cgit v0.10.2 From dfe15ac1c6ad301b092ed295f0cfdb16cfaf5cfa Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 26 Jul 2012 11:12:18 +0200 Subject: md: wakeup thread upon rdev_dec_pending() After each call to rdev_dec_pending() we should wakeup the md thread if the device is found to be faulty. Otherwise we'll incur heavy delays on failing devices. Signed-off-by: Neil Brown Signed-off-by: Hannes Reinecke diff --git a/drivers/md/md.h b/drivers/md/md.h index 14367d9..318ca8f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -474,13 +474,6 @@ static inline int mddev_trylock(struct mddev *mddev) } extern void mddev_unlock(struct mddev *mddev); -static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) -{ - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -} - static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); @@ -666,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev) return !!blk_check_plugged(md_unplug, mddev, sizeof(struct blk_plug_cb)); } + +static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } +} + #endif /* _MD_MD_H */ -- cgit v0.10.2 From 53a6ab4d3f6d6dc87ec8f14998b4b5536ee2968c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Feb 2015 14:09:57 +1100 Subject: md/raid10: fix conversion from RAID0 to RAID10 A RAID0 array (like a LINEAR array) does not have a concept of 'size' being the amount of each device that is in use. Rather, as much of each device as is available is used. So the 'size' is set to 0 and ignored. RAID10 does have this concept and needs it to be set correctly. So when we convert RAID0 to RAID10 we must determine the 'size' (that being the size of the first 'strip_zone' in the RAID0), and set it correctly. Reported-and-tested-by: Xiao Ni Signed-off-by: NeilBrown diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d1203cd..b8d76b1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3872,7 +3872,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return 0; } -static void *raid10_takeover_raid0(struct mddev *mddev) +static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) { struct md_rdev *rdev; struct r10conf *conf; @@ -3882,6 +3882,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } + sector_div(size, devs); /* Set new parameters */ mddev->new_level = 10; @@ -3892,12 +3893,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; + mddev->dev_sectors = size; conf = setup_conf(mddev); if (!IS_ERR(conf)) { rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0) + if (rdev->raid_disk >= 0) { rdev->new_raid_disk = rdev->raid_disk * 2; + rdev->sectors = size; + } conf->barrier = 1; } @@ -3920,7 +3924,9 @@ static void *raid10_takeover(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } - return raid10_takeover_raid0(mddev); + return raid10_takeover_raid0(mddev, + raid0_conf->strip_zone->zone_end, + raid0_conf->strip_zone->nb_dev); } return ERR_PTR(-EINVAL); } -- cgit v0.10.2