From 8f6c2e4b325a8e9f8f47febb2fd0ed4fae7d45a9 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Jul 2009 11:13:45 +1000 Subject: md: Use new topology calls to indicate alignment and I/O sizes Switch MD over to the new disk_stack_limits() function which checks for aligment and adjusts preferred I/O sizes when stacking. Also indicate preferred I/O sizes where applicable. Signed-off-by: Martin K. Petersen Signed-off-by: Mike Snitzer Signed-off-by: NeilBrown diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 15c8b7b..5810fa9 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -166,8 +166,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) rdev->sectors = sectors * mddev->chunk_sectors; } - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index cbe368f..237fe3f 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -294,7 +294,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { q = rdev->bdev->bd_disk->queue; - blk_queue_stack_limits(mddev->queue, q); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as @@ -463,9 +464,9 @@ static int multipath_run (mddev_t *mddev) disk = conf->multipaths + disk_idx; disk->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); /* as we don't honour merge_bvec_fn, we must never risk * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ab4a489..335f490 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -170,8 +170,8 @@ static int create_strip_zones(mddev_t *mddev) } dev[j] = rdev1; - blk_queue_stack_limits(mddev->queue, - rdev1->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev1->bdev, + rdev1->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. @@ -250,6 +250,11 @@ static int create_strip_zones(mddev_t *mddev) mddev->chunk_sectors << 9); goto abort; } + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + printk(KERN_INFO "raid0: done.\n"); mddev->private = conf; return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 89939a7..0569efb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1123,8 +1123,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (mirror = first; mirror <= last; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. @@ -1988,9 +1988,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae12cea..7298a5e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1151,8 +1151,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. @@ -2044,7 +2044,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) static int run(mddev_t *mddev) { conf_t *conf; - int i, disk_idx; + int i, disk_idx, chunk_size; mirror_info_t *disk; mdk_rdev_t *rdev; int nc, fc, fo; @@ -2130,6 +2130,14 @@ static int run(mddev_t *mddev) spin_lock_init(&conf->device_lock); mddev->queue->queue_lock = &conf->device_lock; + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + if (conf->raid_disks % conf->near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); + else + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks / conf->near_copies)); + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks @@ -2138,9 +2146,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e..92ef9b6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4452,7 +4452,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) static int run(mddev_t *mddev) { raid5_conf_t *conf; - int working_disks = 0; + int working_disks = 0, chunk_size; mdk_rdev_t *rdev; if (mddev->recovery_cp != MaxSector) @@ -4607,6 +4607,14 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); + + list_for_each_entry(rdev, &mddev->disks, same_set) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); return 0; abort: -- cgit v0.10.2 From b8d966efd9a46a9a35beac50cbff6e30565125ef Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Jul 2009 11:14:04 +1000 Subject: md: avoid dereferencing NULL pointer when accessing suspend_* sysfs attributes. If we try to modify one of the md/ sysfs files suspend_lo or suspend_hi when the array is not active, we dereference a NULL. Protect against that. Cc: stable@kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 09be637..2166af8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3573,7 +3573,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers->quiesce == NULL) + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; @@ -3601,7 +3602,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers->quiesce == NULL) + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; -- cgit v0.10.2 From 1ec22eb2b4a2e1a763106bce36b11c02eaa84e61 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Jul 2009 12:27:21 +1000 Subject: md: fix error path when duplicate name is found on md device creation. When an md device is created by name (rather than number) we need to check that the name is not already in use. If this check finds a duplicate, we return an error without dropping the lock or freeing the newly create mddev. This patch fixes that. Cc: stable@kernel.org Found-by: Jiri Slaby Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 2166af8..58bee23 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3862,6 +3862,8 @@ static int md_alloc(dev_t dev, char *name) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); + mutex_unlock(&disks_mutex); + mddev_put(mddev); return -EEXIST; } spin_unlock(&all_mddevs_lock); -- cgit v0.10.2 From 0909dc448c98ed5021c87ffdfc09fb473aa464ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Jul 2009 12:27:21 +1000 Subject: md: tidy up error paths in md_alloc As the recent bug in md_alloc showed, having a single exit path for unlocking and putting is a good idea. So restructure md_alloc to have a single mutex_unlock and mddev_put, and use gotos where necessary. Found-by: Jiri Slaby Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 58bee23..65fe35b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3846,11 +3846,9 @@ static int md_alloc(dev_t dev, char *name) flush_scheduled_work(); mutex_lock(&disks_mutex); - if (mddev->gendisk) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return -EEXIST; - } + error = -EEXIST; + if (mddev->gendisk) + goto abort; if (name) { /* Need to ensure that 'name' is not a duplicate. @@ -3862,19 +3860,15 @@ static int md_alloc(dev_t dev, char *name) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return -EEXIST; + goto abort; } spin_unlock(&all_mddevs_lock); } + error = -ENOMEM; mddev->queue = blk_alloc_queue(GFP_KERNEL); - if (!mddev->queue) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return -ENOMEM; - } + if (!mddev->queue) + goto abort; mddev->queue->queuedata = mddev; /* Can be unlocked because the queue is new: no concurrency */ @@ -3884,11 +3878,9 @@ static int md_alloc(dev_t dev, char *name) disk = alloc_disk(1 << shift); if (!disk) { - mutex_unlock(&disks_mutex); blk_cleanup_queue(mddev->queue); mddev->queue = NULL; - mddev_put(mddev); - return -ENOMEM; + goto abort; } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -3910,16 +3902,22 @@ static int md_alloc(dev_t dev, char *name) mddev->gendisk = disk; error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk_to_dev(disk)->kobj, "%s", "md"); - mutex_unlock(&disks_mutex); - if (error) + if (error) { + /* This isn't possible, but as kobject_init_and_add is marked + * __must_check, we must do something with the result + */ printk(KERN_WARNING "md: cannot register %s/md - name in use\n", disk->disk_name); - else { + error = 0; + } + abort: + mutex_unlock(&disks_mutex); + if (!error) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); } mddev_put(mddev); - return 0; + return error; } static struct kobject *md_probe(dev_t dev, int *part, void *data) -- cgit v0.10.2 From a5c308d4d1659b1f4833b863394e3e24cdbdfc6e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Jul 2009 13:15:35 +1000 Subject: md/raid5: suspend shouldn't affect read requests. md allows write to regions on an array to be suspended temporarily. This allows user-space to participate is aspects of reshape. In particular, data can be copied with not risk of a race. We should not be blocking read requests though, so don't. Cc: stable@kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 92ef9b6..1f444ae 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3702,7 +3702,8 @@ static int make_request(struct request_queue *q, struct bio * bi) /* FIXME what if we get a false positive because these * are being updated. */ - if (logical_sector >= mddev->suspend_lo && + if (bio_data_dir(bi) == WRITE && + logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { release_stripe(sh); schedule(); -- cgit v0.10.2 From e62e58a5ffdc98ac28d8dbd070c857620d541f99 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Jul 2009 13:15:35 +1000 Subject: md: use interruptible wait when duration is controlled by userspace. User space can set various limits on an md array so that resync waits when it gets to a certain point, or so that I/O is blocked for a short while. When md is waiting against one of these limit, it should use an interruptible wait so as not to add to the load average, and so are not to trigger a warning if the wait goes on for too long. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 65fe35b..0f4a70c4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6336,10 +6336,16 @@ void md_do_sync(mddev_t *mddev) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - if (j >= mddev->resync_max) - wait_event(mddev->recovery_wait, - mddev->resync_max > j - || kthread_should_stop()); + while (j >= mddev->resync_max && !kthread_should_stop()) { + /* As this condition is controlled by user-space, + * we can block indefinitely, so use '_interruptible' + * to avoid triggering warnings. + */ + flush_signals(current); /* just in case */ + wait_event_interruptible(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + } if (kthread_should_stop()) goto interrupted; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1f444ae..3783553 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3699,14 +3699,21 @@ static int make_request(struct request_queue *q, struct bio * bi) goto retry; } } - /* FIXME what if we get a false positive because these - * are being updated. - */ + if (bio_data_dir(bi) == WRITE && logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { release_stripe(sh); - schedule(); + /* As the suspend_* range is controlled by + * userspace, we want an interruptible + * wait. + */ + flush_signals(current); + prepare_to_wait(&conf->wait_for_overlap, + &w, TASK_INTERRUPTIBLE); + if (logical_sector >= mddev->suspend_lo && + logical_sector < mddev->suspend_hi) + schedule(); goto retry; } -- cgit v0.10.2