From f7bee80945155ad0326916486dabc38428c6cdef Mon Sep 17 00:00:00 2001 From: Krzysztof Wojcik Date: Mon, 14 Feb 2011 10:01:41 +1100 Subject: md: Fix raid1->raid0 takeover Takeover raid1->raid0 not succeded. Kernel message is shown: "md/raid0:md126: too few disks (1 of 2) - aborting!" Problem was that we weren't updating ->raid_disks for that takeover, unlike all the others. Signed-off-by: Krzysztof Wojcik Signed-off-by: NeilBrown diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 637a968..75671df 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -670,6 +670,7 @@ static void *raid0_takeover_raid1(mddev_t *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */ mddev->delta_disks = 1 - mddev->raid_disks; + mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; -- cgit v0.10.2 From cbe6ef1d2622e08e272600b3cb6040bed60f0450 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2011 13:58:38 +1100 Subject: md: don't set_capacity before array is active. If the desired size of an array is set (via sysfs) before the array is active (which is the normal sequence), we currrently call set_capacity immediately. This means that a subsequent 'open' (as can be caused by some udev-triggers program) will notice the new size and try to probe for partitions. However as the array isn't quite ready yet the read will fail. Then when the array is read, as the size doesn't change again we don't try to re-probe. So when setting array size via sysfs, only call set_capacity if the array is already active. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 0cc30ec..6818ff4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4138,10 +4138,10 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) } mddev->array_sectors = sectors; - set_capacity(mddev->gendisk, mddev->array_sectors); - if (mddev->pers) + if (mddev->pers) { + set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); - + } return len; } -- cgit v0.10.2 From 8f5f02c460b7ca74ce55ce126ce0c1e58a3f923d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2011 13:58:51 +1100 Subject: md: correctly handle probe of an 'mdp' device. 'mdp' devices are md devices with preallocated device numbers for partitions. As such it is possible to mknod and open a partition before opening the whole device. this causes md_probe() to be called with a device number of a partition, which in-turn calls mddev_find with such a number. However mddev_find expects the number of a 'whole device' and does the wrong thing with partition numbers. So add code to mddev_find to remove the 'partition' part of a device number and just work with the 'whole device'. This patch addresses https://bugzilla.kernel.org/show_bug.cgi?id=28652 Reported-by: hkmaly@bigfoot.com Signed-off-by: NeilBrown Cc: diff --git a/drivers/md/md.c b/drivers/md/md.c index 6818ff4..330addf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -553,6 +553,9 @@ static mddev_t * mddev_find(dev_t unit) { mddev_t *mddev, *new = NULL; + if (unit && MAJOR(unit) != MD_MAJOR) + unit &= ~((1< Date: Mon, 21 Feb 2011 18:25:57 +1100 Subject: md: avoid spinlock problem in blk_throtl_exit blk_throtl_exit assumes that ->queue_lock still exists, so make sure that it does. To do this, we stop redirecting ->queue_lock to conf->device_lock and leave it pointing where it is initialised - __queue_lock. As the blk_plug functions check the ->queue_lock is held, we now take that spin_lock explicitly around the plug functions. We don't need the locking, just the warning removal. This is needed for any kernel with the blk_throtl code, which is which is 2.6.37 and later. Cc: stable@kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 8a2f767..0ed7f6b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -216,7 +216,6 @@ static int linear_run (mddev_t *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; - mddev->queue->queue_lock = &mddev->queue->__queue_lock; conf = linear_conf(mddev, mddev->raid_disks); if (!conf) diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 6d7ddf3..3a62d44 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -435,7 +435,6 @@ static int multipath_run (mddev_t *mddev) * bookkeeping area. [whatever we allocate in multipath_run(), * should be freed in multipath_stop()] */ - mddev->queue->queue_lock = &mddev->queue->__queue_lock; conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); mddev->private = conf; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 75671df..c0ac457 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -361,7 +361,6 @@ static int raid0_run(mddev_t *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); - mddev->queue->queue_lock = &mddev->queue->__queue_lock; /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a23ffa3..06cd712 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -593,7 +593,10 @@ static int flush_pending_writes(conf_t *conf) if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); + /* Only take the spinlock to quiet a warning */ + spin_lock(conf->mddev->queue->queue_lock); blk_remove_plug(conf->mddev->queue); + spin_unlock(conf->mddev->queue->queue_lock); spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to * disk before proceeding w/ I/O */ @@ -959,7 +962,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_inc(&r1_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); - blk_plug_device(mddev->queue); + blk_plug_device_unlocked(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); } r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL); @@ -2021,7 +2024,6 @@ static int run(mddev_t *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - mddev->queue->queue_lock = &conf->device_lock; list_for_each_entry(rdev, &mddev->disks, same_set) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3b607b2..747d061 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -662,7 +662,10 @@ static int flush_pending_writes(conf_t *conf) if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); + /* Spinlock only taken to quiet a warning */ + spin_lock(conf->mddev->queue->queue_lock); blk_remove_plug(conf->mddev->queue); + spin_unlock(conf->mddev->queue->queue_lock); spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to disk * before proceeding w/ I/O */ @@ -971,7 +974,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_inc(&r10_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); - blk_plug_device(mddev->queue); + blk_plug_device_unlocked(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); } @@ -2304,8 +2307,6 @@ static int run(mddev_t *mddev) if (!conf) goto out; - mddev->queue->queue_lock = &conf->device_lock; - mddev->thread = conf->thread; conf->thread = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7028128..78536fd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5204,7 +5204,6 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->queue->queue_lock = &conf->device_lock; mddev->queue->unplug_fn = raid5_unplug_queue; chunk_size = mddev->chunk_sectors << 9; -- cgit v0.10.2 From 93b270f76e7ef3b81001576860c2701931cdc78b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Feb 2011 17:25:47 +1100 Subject: Fix over-zealous flush_disk when changing device size. There are two cases when we call flush_disk. In one, the device has disappeared (check_disk_change) so any data will hold becomes irrelevant. In the oter, the device has changed size (check_disk_size_change) so data we hold may be irrelevant. In both cases it makes sense to discard any 'clean' buffers, so they will be read back from the device if needed. In the former case it makes sense to discard 'dirty' buffers as there will never be anywhere safe to write the data. In the second case it *does*not* make sense to discard dirty buffers as that will lead to file system corruption when you simply enlarge the containing devices. flush_disk calls __invalidate_devices. __invalidate_device calls both invalidate_inodes and invalidate_bdev. invalidate_inodes *does* discard I_DIRTY inodes and this does lead to fs corruption. invalidate_bev *does*not* discard dirty pages, but I don't really care about that at present. So this patch adds a flag to __invalidate_device (calling it __invalidate_device2) to indicate whether dirty buffers should be killed, and this is passed to invalidate_inodes which can choose to skip dirty inodes. flusk_disk then passes true from check_disk_change and false from check_disk_size_change. dm avoids tripping over this problem by calling i_size_write directly rathher than using check_disk_size_change. md does use check_disk_size_change and so is affected. This regression was introduced by commit 608aeef17a which causes check_disk_size_change to call flush_disk, so it is suitable for any kernel since 2.6.27. Cc: stable@kernel.org Acked-by: Jeff Moyer Cc: Andrew Patterson Cc: Jens Axboe Signed-off-by: NeilBrown diff --git a/block/genhd.c b/block/genhd.c index 6a5b772..cbf1112 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1355,7 +1355,7 @@ int invalidate_partition(struct gendisk *disk, int partno) struct block_device *bdev = bdget_disk(disk, partno); if (bdev) { fsync_bdev(bdev); - res = __invalidate_device(bdev); + res = __invalidate_device(bdev, true); bdput(bdev); } return res; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index b9ba04f..77fc76f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3281,7 +3281,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g, struct block_device *bdev = opened_bdev[cnt]; if (!bdev || ITYPE(drive_state[cnt].fd_device) != type) continue; - __invalidate_device(bdev); + __invalidate_device(bdev, true); } mutex_unlock(&open_lock); } else { diff --git a/fs/block_dev.c b/fs/block_dev.c index 333a7bb..5e23152 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -927,9 +927,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * when a disk has been changed -- either by a media change or online * resize. */ -static void flush_disk(struct block_device *bdev) +static void flush_disk(struct block_device *bdev, bool kill_dirty) { - if (__invalidate_device(bdev)) { + if (__invalidate_device(bdev, kill_dirty)) { char name[BDEVNAME_SIZE] = ""; if (bdev->bd_disk) @@ -966,7 +966,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev); + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); @@ -1019,7 +1019,7 @@ int check_disk_change(struct block_device *bdev) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; - flush_disk(bdev); + flush_disk(bdev, true); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); return 1; @@ -1601,7 +1601,7 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -int __invalidate_device(struct block_device *bdev) +int __invalidate_device(struct block_device *bdev, bool kill_dirty) { struct super_block *sb = get_super(bdev); int res = 0; @@ -1614,7 +1614,7 @@ int __invalidate_device(struct block_device *bdev) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes(sb, kill_dirty); drop_super(sb); } invalidate_bdev(bdev); diff --git a/fs/inode.c b/fs/inode.c index da85e56..c50d7fe 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -540,11 +540,14 @@ void evict_inodes(struct super_block *sb) /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes * * Attempts to free all inodes for a given superblock. If there were any * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. */ -int invalidate_inodes(struct super_block *sb) +int invalidate_inodes(struct super_block *sb, bool kill_dirty) { int busy = 0; struct inode *inode, *next; @@ -556,6 +559,10 @@ int invalidate_inodes(struct super_block *sb) list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; + if (inode->i_state & I_DIRTY && !kill_dirty) { + busy = 1; + continue; + } if (atomic_read(&inode->i_count)) { busy = 1; continue; diff --git a/fs/internal.h b/fs/internal.h index 0663568..9b976b5 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -112,4 +112,4 @@ extern void release_open_intent(struct nameidata *); */ extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/fs.h b/include/linux/fs.h index 32b38cd..683f4c5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2139,7 +2139,7 @@ extern void check_disk_size_change(struct gendisk *disk, struct block_device *bdev); extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); -extern int __invalidate_device(struct block_device *); +extern int __invalidate_device(struct block_device *, bool); extern int invalidate_partition(struct gendisk *, int); #endif unsigned long invalidate_mapping_pages(struct address_space *mapping, -- cgit v0.10.2 From f0b4f7e2f29af678bd9af43422c537dcb6008603 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Feb 2011 17:26:41 +1100 Subject: md: Fix - again - partition detection when array becomes active Revert b821eaa572fd737faaf6928ba046e571526c36c6 and f3b99be19ded511a1bf05a148276239d9f13eefa When I wrote the first of these I had a wrong idea about the lifetime of 'struct block_device'. It can disappear at any time that the block device is not open if it falls out of the inode cache. So relying on the 'size' recorded with it to detect when the device size has changed and so we need to revalidate, is wrong. Rather, we really do need the 'changed' attribute stored directly in the mddev and set/tested as appropriate. Without this patch, a sequence of: mknod / open / close / unlink (which can cause a block_device to be created and then destroyed) will result in a rescan of the partition table and consequence removal and addition of partitions. Several of these in a row can get udev racing to create and unlink and other code can get confused. With the patch, the rescan is only performed when needed and so there are no races. This is suitable for any stable kernel from 2.6.35. Reported-by: "Wojcik, Krzysztof" Signed-off-by: NeilBrown Cc: stable@kernel.org diff --git a/drivers/md/md.c b/drivers/md/md.c index 330addf..818313e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4627,6 +4627,7 @@ static int do_md_run(mddev_t *mddev) } set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); + mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); out: return err; @@ -4715,6 +4716,7 @@ static void md_clean(mddev_t *mddev) mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; mddev->in_sync = 0; + mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; mddev->bitmap_info.offset = 0; @@ -4830,6 +4832,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) set_capacity(disk, 0); mutex_unlock(&mddev->open_mutex); + mddev->changed = 1; revalidate_disk(disk); if (mddev->ro) @@ -6014,7 +6017,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); - check_disk_size_change(mddev->gendisk, bdev); + check_disk_change(bdev); out: return err; } @@ -6029,6 +6032,21 @@ static int md_release(struct gendisk *disk, fmode_t mode) return 0; } + +static int md_media_changed(struct gendisk *disk) +{ + mddev_t *mddev = disk->private_data; + + return mddev->changed; +} + +static int md_revalidate(struct gendisk *disk) +{ + mddev_t *mddev = disk->private_data; + + mddev->changed = 0; + return 0; +} static const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -6039,6 +6057,8 @@ static const struct block_device_operations md_fops = .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, + .media_changed = md_media_changed, + .revalidate_disk= md_revalidate, }; static int md_thread(void * arg) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7e90b85..12215d4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -274,6 +274,8 @@ struct mddev_s atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ + int changed; /* True if we might need to + * reread partition info */ int degraded; /* whether md should consider * adding a spare */ -- cgit v0.10.2