diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 1 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 18 | ||||
-rw-r--r-- | drivers/md/dm-bio-list.h | 14 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 94 | ||||
-rw-r--r-- | drivers/md/dm-emc.c | 10 | ||||
-rw-r--r-- | drivers/md/dm-hw-handler.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 15 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 25 | ||||
-rw-r--r-- | drivers/md/dm-linear.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 24 | ||||
-rw-r--r-- | drivers/md/dm-log.h | 10 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 72 | ||||
-rw-r--r-- | drivers/md/dm-mpath.h | 4 | ||||
-rw-r--r-- | drivers/md/dm-path-selector.h | 12 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 47 | ||||
-rw-r--r-- | drivers/md/dm-round-robin.c | 14 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 48 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-zero.c | 2 | ||||
-rw-r--r-- | drivers/md/dm.c | 138 | ||||
-rw-r--r-- | drivers/md/dm.h | 19 | ||||
-rw-r--r-- | drivers/md/kcopyd.c | 6 | ||||
-rw-r--r-- | drivers/md/md.c | 56 | ||||
-rw-r--r-- | drivers/md/multipath.c | 4 | ||||
-rw-r--r-- | drivers/md/raid1.c | 5 | ||||
-rw-r--r-- | drivers/md/raid10.c | 6 | ||||
-rw-r--r-- | drivers/md/raid5.c | 357 |
27 files changed, 742 insertions, 267 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index c92c1521..4540ade 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -215,6 +215,7 @@ config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM && EXPERIMENTAL select CRYPTO + select CRYPTO_CBC ---help--- This device-mapper target allows you to create a device that transparently encrypts the data on it. You'll need to activate diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d47d38a..5432d07 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -212,8 +212,8 @@ char *file_path(struct file *file, char *buf, int count) if (!buf) return NULL; - d = file->f_dentry; - v = file->f_vfsmnt; + d = file->f_path.dentry; + v = file->f_path.mnt; buf = d_path(d, v, buf, count); @@ -349,7 +349,7 @@ static struct page *read_page(struct file *file, unsigned long index, unsigned long count) { struct page *page = NULL; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct buffer_head *bh; sector_t block; @@ -536,7 +536,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " "-- forcing full recovery\n", bmname(bitmap), events, (unsigned long long) bitmap->mddev->events); - sb->state |= BITMAP_STALE; + sb->state |= cpu_to_le32(BITMAP_STALE); } success: /* assign fields using values from superblock */ @@ -544,11 +544,11 @@ success: bitmap->daemon_sleep = daemon_sleep; bitmap->daemon_lastrun = jiffies; bitmap->max_write_behind = write_behind; - bitmap->flags |= sb->state; + bitmap->flags |= le32_to_cpu(sb->state); if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) bitmap->flags |= BITMAP_HOSTENDIAN; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - if (sb->state & BITMAP_STALE) + if (sb->state & cpu_to_le32(BITMAP_STALE)) bitmap->events_cleared = bitmap->mddev->events; err = 0; out: @@ -578,9 +578,9 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, spin_unlock_irqrestore(&bitmap->lock, flags); sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); switch (op) { - case MASK_SET: sb->state |= bits; + case MASK_SET: sb->state |= cpu_to_le32(bits); break; - case MASK_UNSET: sb->state &= ~bits; + case MASK_UNSET: sb->state &= cpu_to_le32(~bits); break; default: BUG(); } @@ -662,7 +662,7 @@ static void bitmap_file_put(struct bitmap *bitmap) bitmap_file_unmap(bitmap); if (file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; invalidate_inode_pages(inode->i_mapping); fput(file); } diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h index bbf4615..da43496 100644 --- a/drivers/md/dm-bio-list.h +++ b/drivers/md/dm-bio-list.h @@ -44,6 +44,20 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) bl->tail = bl2->tail; } +static inline void bio_list_merge_head(struct bio_list *bl, + struct bio_list *bl2) +{ + if (!bl2->head) + return; + + if (bl->head) + bl2->tail->bi_next = bl->head; + else + bl->tail = bl2->tail; + + bl->head = bl2->head; +} + static inline struct bio *bio_list_pop(struct bio_list *bl) { struct bio *bio = bl->head; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 655d816..4c2471e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -16,9 +16,11 @@ #include <linux/slab.h> #include <linux/crypto.h> #include <linux/workqueue.h> +#include <linux/backing-dev.h> #include <asm/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> +#include <asm/unaligned.h> #include "dm.h" @@ -84,7 +86,10 @@ struct crypt_config { */ struct crypt_iv_operations *iv_gen_ops; char *iv_mode; - struct crypto_cipher *iv_gen_private; + union { + struct crypto_cipher *essiv_tfm; + int benbi_shift; + } iv_gen_private; sector_t iv_offset; unsigned int iv_size; @@ -100,7 +105,7 @@ struct crypt_config { #define MIN_POOL_PAGES 32 #define MIN_BIO_PAGES 8 -static kmem_cache_t *_crypt_io_pool; +static struct kmem_cache *_crypt_io_pool; /* * Different IV generation algorithms: @@ -112,6 +117,9 @@ static kmem_cache_t *_crypt_io_pool; * encrypted with the bulk cipher using a salt as key. The salt * should be derived from the bulk cipher's key via hashing. * + * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1 + * (needed for LRW-32-AES and possible other narrow block modes) + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ @@ -190,21 +198,61 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, } kfree(salt); - cc->iv_gen_private = essiv_tfm; + cc->iv_gen_private.essiv_tfm = essiv_tfm; return 0; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { - crypto_free_cipher(cc->iv_gen_private); - cc->iv_gen_private = NULL; + crypto_free_cipher(cc->iv_gen_private.essiv_tfm); + cc->iv_gen_private.essiv_tfm = NULL; } static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) { memset(iv, 0, cc->iv_size); *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private, iv, iv); + crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); + return 0; +} + +static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + unsigned int bs = crypto_blkcipher_blocksize(cc->tfm); + int log = ilog2(bs); + + /* we need to calculate how far we must shift the sector count + * to get the cipher block count, we use this shift in _gen */ + + if (1 << log != bs) { + ti->error = "cypher blocksize is not a power of 2"; + return -EINVAL; + } + + if (log > 9) { + ti->error = "cypher blocksize is > 512"; + return -EINVAL; + } + + cc->iv_gen_private.benbi_shift = 9 - log; + + return 0; +} + +static void crypt_iv_benbi_dtr(struct crypt_config *cc) +{ +} + +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +{ + __be64 val; + + memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ + + val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); + put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); + return 0; } @@ -218,13 +266,18 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = { .generator = crypt_iv_essiv_gen }; +static struct crypt_iv_operations crypt_iv_benbi_ops = { + .ctr = crypt_iv_benbi_ctr, + .dtr = crypt_iv_benbi_dtr, + .generator = crypt_iv_benbi_gen +}; static int crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, struct scatterlist *in, unsigned int length, int write, sector_t sector) { - u8 iv[cc->iv_size]; + u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64)))); struct blkcipher_desc desc = { .tfm = cc->tfm, .info = iv, @@ -457,11 +510,11 @@ static void dec_pending(struct crypt_io *io, int error) * interrupt context. */ static struct workqueue_struct *_kcryptd_workqueue; -static void kcryptd_do_work(void *data); +static void kcryptd_do_work(struct work_struct *work); static void kcryptd_queue_io(struct crypt_io *io) { - INIT_WORK(&io->work, kcryptd_do_work, io); + INIT_WORK(&io->work, kcryptd_do_work); queue_work(_kcryptd_workqueue, &io->work); } @@ -602,7 +655,7 @@ static void process_write(struct crypt_io *io) /* out of memory -> run queues */ if (remaining) - blk_congestion_wait(bio_data_dir(clone), HZ/100); + congestion_wait(bio_data_dir(clone), HZ/100); } } @@ -617,9 +670,9 @@ static void process_read_endio(struct crypt_io *io) dec_pending(io, crypt_convert(cc, &ctx)); } -static void kcryptd_do_work(void *data) +static void kcryptd_do_work(struct work_struct *work) { - struct crypt_io *io = data; + struct crypt_io *io = container_of(work, struct crypt_io, work); if (io->post_process) process_read_endio(io); @@ -767,7 +820,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->tfm = tfm; /* - * Choose ivmode. Valid modes: "plain", "essiv:<esshash>". + * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". * See comments at iv code */ @@ -777,6 +830,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops = &crypt_iv_plain_ops; else if (strcmp(ivmode, "essiv") == 0) cc->iv_gen_ops = &crypt_iv_essiv_ops; + else if (strcmp(ivmode, "benbi") == 0) + cc->iv_gen_ops = &crypt_iv_benbi_ops; else { ti->error = "Invalid IV mode"; goto bad2; @@ -907,15 +962,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, atomic_set(&io->pending, 0); kcryptd_queue_io(io); - return 0; + return DM_MAPIO_SUBMITTED; } static int crypt_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct crypt_config *cc = (struct crypt_config *) ti->private; - const char *cipher; - const char *chainmode = NULL; unsigned int sz = 0; switch (type) { @@ -924,14 +977,11 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - cipher = crypto_blkcipher_name(cc->tfm); - - chainmode = cc->chainmode; - if (cc->iv_mode) - DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode); + DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode, + cc->iv_mode); else - DMEMIT("%s-%s ", cipher, chainmode); + DMEMIT("%s-%s ", cc->cipher, cc->chainmode); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c index 2b2d45d7..265c467 100644 --- a/drivers/md/dm-emc.c +++ b/drivers/md/dm-emc.c @@ -40,7 +40,7 @@ static inline void free_bio(struct bio *bio) static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) { - struct path *path = bio->bi_private; + struct dm_path *path = bio->bi_private; if (bio->bi_size) return 1; @@ -61,7 +61,7 @@ static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) return 0; } -static struct bio *get_failover_bio(struct path *path, unsigned data_size) +static struct bio *get_failover_bio(struct dm_path *path, unsigned data_size) { struct bio *bio; struct page *page; @@ -96,7 +96,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size) } static struct request *get_failover_req(struct emc_handler *h, - struct bio *bio, struct path *path) + struct bio *bio, struct dm_path *path) { struct request *rq; struct block_device *bdev = bio->bi_bdev; @@ -133,7 +133,7 @@ static struct request *get_failover_req(struct emc_handler *h, } static struct request *emc_trespass_get(struct emc_handler *h, - struct path *path) + struct dm_path *path) { struct bio *bio; struct request *rq; @@ -191,7 +191,7 @@ static struct request *emc_trespass_get(struct emc_handler *h, } static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, - struct path *path) + struct dm_path *path) { struct request *rq; struct request_queue *q = bdev_get_queue(path->dev->bdev); diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h index 15f5629..32eff28 100644 --- a/drivers/md/dm-hw-handler.h +++ b/drivers/md/dm-hw-handler.h @@ -32,7 +32,7 @@ struct hw_handler_type { void (*destroy) (struct hw_handler *hwh); void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, - struct path *path); + struct dm_path *path); unsigned (*error) (struct hw_handler *hwh, struct bio *bio); int (*status) (struct hw_handler *hwh, status_type_t type, char *result, unsigned int maxlen); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index da663d2..4eb73d3 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -92,12 +92,12 @@ void dm_io_put(unsigned int num_pages) *---------------------------------------------------------------*/ static inline void bio_set_region(struct bio *bio, unsigned region) { - bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region; + bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; } static inline unsigned bio_get_region(struct bio *bio) { - return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len; + return bio->bi_io_vec[bio->bi_max_vecs].bv_len; } /*----------------------------------------------------------------- @@ -136,6 +136,7 @@ static int endio(struct bio *bio, unsigned int done, int error) zero_fill_bio(bio); dec_count(io, bio_get_region(bio), error); + bio->bi_max_vecs++; bio_put(bio); return 0; @@ -250,16 +251,18 @@ static void do_region(int rw, unsigned int region, struct io_region *where, while (remaining) { /* - * Allocate a suitably sized bio, we add an extra - * bvec for bio_get/set_region(). + * Allocate a suitably sized-bio: we add an extra + * bvec for bio_get/set_region() and decrement bi_max_vecs + * to hide it from bio_add_page(). */ - num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2; + num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2; bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; bio->bi_private = io; bio->bi_destructor = dm_bio_destructor; + bio->bi_max_vecs--; bio_set_region(bio, region); /* @@ -302,7 +305,7 @@ static void dispatch_io(int rw, unsigned int num_regions, } /* - * Drop the extra refence that we were holding to avoid + * Drop the extra reference that we were holding to avoid * the io being completed too early. */ dec_count(io, 0, 0); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d13bb15..cd6a184 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -606,9 +606,14 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) return __get_name_cell(param->name); md = dm_get_md(huge_decode_dev(param->dev)); - if (md) - mdptr = dm_get_mdptr(md); + if (!md) + goto out; + mdptr = dm_get_mdptr(md); + if (!mdptr) + dm_put(md); + +out: return mdptr; } @@ -760,7 +765,7 @@ out: static int do_suspend(struct dm_ioctl *param) { int r = 0; - int do_lockfs = 1; + unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; struct mapped_device *md; md = find_device(param); @@ -768,10 +773,12 @@ static int do_suspend(struct dm_ioctl *param) return -ENXIO; if (param->flags & DM_SKIP_LOCKFS_FLAG) - do_lockfs = 0; + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; if (!dm_suspended(md)) - r = dm_suspend(md, do_lockfs); + r = dm_suspend(md, suspend_flags); if (!r) r = __dev_status(md, param); @@ -783,7 +790,7 @@ static int do_suspend(struct dm_ioctl *param) static int do_resume(struct dm_ioctl *param) { int r = 0; - int do_lockfs = 1; + unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; struct hash_cell *hc; struct mapped_device *md; struct dm_table *new_map; @@ -809,9 +816,11 @@ static int do_resume(struct dm_ioctl *param) if (new_map) { /* Suspend if it isn't already suspended */ if (param->flags & DM_SKIP_LOCKFS_FLAG) - do_lockfs = 0; + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; if (!dm_suspended(md)) - dm_suspend(md, do_lockfs); + dm_suspend(md, suspend_flags); r = dm_swap_table(md, new_map); if (r) { diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 0023490..17753d8 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -77,7 +77,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = lc->dev->bdev; bio->bi_sector = lc->start + (bio->bi_sector - ti->begin); - return 1; + return DM_MAPIO_REMAPPED; } static int linear_status(struct dm_target *ti, status_type_t type, @@ -108,7 +108,7 @@ static int linear_ioctl(struct dm_target *ti, struct inode *inode, struct dentry fake_dentry = {}; fake_file.f_mode = lc->dev->mode; - fake_file.f_dentry = &fake_dentry; + fake_file.f_path.dentry = &fake_dentry; fake_dentry.d_inode = bdev->bd_inode; return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg); diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 64b764b..6a92613 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -466,6 +466,7 @@ static int disk_resume(struct dirty_log *log) /* copy clean across to sync */ memcpy(lc->sync_bits, lc->clean_bits, size); lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); + lc->sync_search = 0; /* set the correct number of regions in the header */ lc->header.nr_regions = lc->region_count; @@ -480,6 +481,13 @@ static uint32_t core_get_region_size(struct dirty_log *log) return lc->region_size; } +static int core_resume(struct dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + lc->sync_search = 0; + return 0; +} + static int core_is_clean(struct dirty_log *log, region_t region) { struct log_c *lc = (struct log_c *) log->context; @@ -549,16 +557,19 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region) return 1; } -static void core_complete_resync_work(struct dirty_log *log, region_t region, - int success) +static void core_set_region_sync(struct dirty_log *log, region_t region, + int in_sync) { struct log_c *lc = (struct log_c *) log->context; log_clear_bit(lc, lc->recovering_bits, region); - if (success) { + if (in_sync) { log_set_bit(lc, lc->sync_bits, region); lc->sync_count++; - } + } else if (log_test_bit(lc->sync_bits, region)) { + lc->sync_count--; + log_clear_bit(lc, lc->sync_bits, region); + } } static region_t core_get_sync_count(struct dirty_log *log) @@ -618,6 +629,7 @@ static struct dirty_log_type _core_type = { .module = THIS_MODULE, .ctr = core_ctr, .dtr = core_dtr, + .resume = core_resume, .get_region_size = core_get_region_size, .is_clean = core_is_clean, .in_sync = core_in_sync, @@ -625,7 +637,7 @@ static struct dirty_log_type _core_type = { .mark_region = core_mark_region, .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, + .set_region_sync = core_set_region_sync, .get_sync_count = core_get_sync_count, .status = core_status, }; @@ -644,7 +656,7 @@ static struct dirty_log_type _disk_type = { .mark_region = core_mark_region, .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, + .set_region_sync = core_set_region_sync, .get_sync_count = core_get_sync_count, .status = disk_status, }; diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h index 5ae5309..86a301c 100644 --- a/drivers/md/dm-log.h +++ b/drivers/md/dm-log.h @@ -90,12 +90,12 @@ struct dirty_log_type { int (*get_resync_work)(struct dirty_log *log, region_t *region); /* - * This notifies the log that the resync of an area has - * been completed. The log should then mark this region - * as CLEAN. + * This notifies the log that the resync status of a region + * has changed. It also clears the region from the recovering + * list (if present). */ - void (*complete_resync_work)(struct dirty_log *log, - region_t region, int success); + void (*set_region_sync)(struct dirty_log *log, + region_t region, int in_sync); /* * Returns the number of regions that are in sync. diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d754e0b..3aa0135 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -31,7 +31,7 @@ struct pgpath { struct priority_group *pg; /* Owning PG */ unsigned fail_count; /* Cumulative failure count */ - struct path path; + struct dm_path path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -101,11 +101,11 @@ typedef int (*action_fn) (struct pgpath *pgpath); #define MIN_IOS 256 /* Mempool size */ -static kmem_cache_t *_mpio_cache; +static struct kmem_cache *_mpio_cache; struct workqueue_struct *kmultipathd; -static void process_queued_ios(void *data); -static void trigger_event(void *data); +static void process_queued_ios(struct work_struct *work); +static void trigger_event(struct work_struct *work); /*----------------------------------------------- @@ -173,8 +173,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); m->queue_io = 1; - INIT_WORK(&m->process_queued_ios, process_queued_ios, m); - INIT_WORK(&m->trigger_event, trigger_event, m); + INIT_WORK(&m->process_queued_ios, process_queued_ios); + INIT_WORK(&m->trigger_event, trigger_event); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); @@ -229,7 +229,7 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) { - struct path *path; + struct dm_path *path; path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); if (!path) @@ -282,10 +282,27 @@ failed: m->current_pg = NULL; } +/* + * Check whether bios must be queued in the device-mapper core rather + * than here in the target. + * + * m->lock must be held on entry. + * + * If m->queue_if_no_path and m->saved_queue_if_no_path hold the + * same value then we are not between multipath_presuspend() + * and multipath_resume() calls and we have no need to check + * for the DMF_NOFLUSH_SUSPENDING flag. + */ +static int __must_push_back(struct multipath *m) +{ + return (m->queue_if_no_path != m->saved_queue_if_no_path && + dm_noflush_suspending(m->ti)); +} + static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, unsigned was_queued) { - int r = 1; + int r = DM_MAPIO_REMAPPED; unsigned long flags; struct pgpath *pgpath; @@ -310,11 +327,13 @@ static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, !m->queue_io) queue_work(kmultipathd, &m->process_queued_ios); pgpath = NULL; - r = 0; - } else if (!pgpath) - r = -EIO; /* Failed */ - else + r = DM_MAPIO_SUBMITTED; + } else if (pgpath) bio->bi_bdev = pgpath->path.dev->bdev; + else if (__must_push_back(m)) + r = DM_MAPIO_REQUEUE; + else + r = -EIO; /* Failed */ mpio->pgpath = pgpath; @@ -372,16 +391,19 @@ static void dispatch_queued_ios(struct multipath *m) r = map_io(m, bio, mpio, 1); if (r < 0) bio_endio(bio, bio->bi_size, r); - else if (r == 1) + else if (r == DM_MAPIO_REMAPPED) generic_make_request(bio); + else if (r == DM_MAPIO_REQUEUE) + bio_endio(bio, bio->bi_size, -EIO); bio = next; } } -static void process_queued_ios(void *data) +static void process_queued_ios(struct work_struct *work) { - struct multipath *m = (struct multipath *) data; + struct multipath *m = + container_of(work, struct multipath, process_queued_ios); struct hw_handler *hwh = &m->hw_handler; struct pgpath *pgpath = NULL; unsigned init_required = 0, must_queue = 1; @@ -421,9 +443,10 @@ out: * An event is triggered whenever a path is taken out of use. * Includes path failure and PG bypass. */ -static void trigger_event(void *data) +static void trigger_event(struct work_struct *work) { - struct multipath *m = (struct multipath *) data; + struct multipath *m = + container_of(work, struct multipath, trigger_event); dm_table_event(m->ti->table); } @@ -781,7 +804,7 @@ static int multipath_map(struct dm_target *ti, struct bio *bio, map_context->ptr = mpio; bio->bi_rw |= (1 << BIO_RW_FAILFAST); r = map_io(m, bio, mpio, 0); - if (r < 0) + if (r < 0 || r == DM_MAPIO_REQUEUE) mempool_free(mpio, m->mpio_pool); return r; @@ -955,7 +978,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) /* * pg_init must call this when it has completed its initialisation */ -void dm_pg_init_complete(struct path *path, unsigned err_flags) +void dm_pg_init_complete(struct dm_path *path, unsigned err_flags) { struct pgpath *pgpath = path_to_pgpath(path); struct priority_group *pg = pgpath->pg; @@ -1005,7 +1028,10 @@ static int do_end_io(struct multipath *m, struct bio *bio, spin_lock_irqsave(&m->lock, flags); if (!m->nr_valid_paths) { - if (!m->queue_if_no_path) { + if (__must_push_back(m)) { + spin_unlock_irqrestore(&m->lock, flags); + return DM_ENDIO_REQUEUE; + } else if (!m->queue_if_no_path) { spin_unlock_irqrestore(&m->lock, flags); return -EIO; } else { @@ -1040,7 +1066,7 @@ static int do_end_io(struct multipath *m, struct bio *bio, queue_work(kmultipathd, &m->process_queued_ios); spin_unlock_irqrestore(&m->lock, flags); - return 1; /* io not complete */ + return DM_ENDIO_INCOMPLETE; /* io not complete */ } static int multipath_end_io(struct dm_target *ti, struct bio *bio, @@ -1058,7 +1084,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path); } - if (r <= 0) + if (r != DM_ENDIO_INCOMPLETE) mempool_free(mpio, m->mpio_pool); return r; @@ -1270,7 +1296,7 @@ static int multipath_ioctl(struct dm_target *ti, struct inode *inode, struct dentry fake_dentry = {}; int r = 0; - fake_file.f_dentry = &fake_dentry; + fake_file.f_path.dentry = &fake_dentry; spin_lock_irqsave(&m->lock, flags); diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h index 8a4bf2b..b9cdcbb 100644 --- a/drivers/md/dm-mpath.h +++ b/drivers/md/dm-mpath.h @@ -11,7 +11,7 @@ struct dm_dev; -struct path { +struct dm_path { struct dm_dev *dev; /* Read-only */ unsigned is_active; /* Read-only */ @@ -20,6 +20,6 @@ struct path { }; /* Callback for hwh_pg_init_fn to use when complete */ -void dm_pg_init_complete(struct path *path, unsigned err_flags); +void dm_pg_init_complete(struct dm_path *path, unsigned err_flags); #endif diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 732d06a..27357b8 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -44,7 +44,7 @@ struct path_selector_type { * Add an opaque path object, along with some selector specific * path args (eg, path priority). */ - int (*add_path) (struct path_selector *ps, struct path *path, + int (*add_path) (struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error); /* @@ -55,27 +55,27 @@ struct path_selector_type { * calling the function again. 0 means don't call it again unless * the path fails. */ - struct path *(*select_path) (struct path_selector *ps, + struct dm_path *(*select_path) (struct path_selector *ps, unsigned *repeat_count); /* * Notify the selector that a path has failed. */ - void (*fail_path) (struct path_selector *ps, struct path *p); + void (*fail_path) (struct path_selector *ps, struct dm_path *p); /* * Ask selector to reinstate a path. */ - int (*reinstate_path) (struct path_selector *ps, struct path *p); + int (*reinstate_path) (struct path_selector *ps, struct dm_path *p); /* * Table content based on parameters added in ps_add_path_fn * or path selector status */ - int (*status) (struct path_selector *ps, struct path *path, + int (*status) (struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen); - int (*end_io) (struct path_selector *ps, struct path *path); + int (*end_io) (struct path_selector *ps, struct dm_path *path); }; /* Register a path selector */ diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 659224c..23a6426 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -24,6 +24,7 @@ static struct workqueue_struct *_kmirrord_wq; static struct work_struct _kmirrord_work; +static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); static inline void wake(void) { @@ -83,6 +84,7 @@ struct region_hash { struct list_head *buckets; spinlock_t region_lock; + atomic_t recovery_in_flight; struct semaphore recovery_count; struct list_head clean_regions; struct list_head quiesced_regions; @@ -191,6 +193,7 @@ static int rh_init(struct region_hash *rh, struct mirror_set *ms, spin_lock_init(&rh->region_lock); sema_init(&rh->recovery_count, 0); + atomic_set(&rh->recovery_in_flight, 0); INIT_LIST_HEAD(&rh->clean_regions); INIT_LIST_HEAD(&rh->quiesced_regions); INIT_LIST_HEAD(&rh->recovered_regions); @@ -341,6 +344,17 @@ static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list) } } +static void complete_resync_work(struct region *reg, int success) +{ + struct region_hash *rh = reg->rh; + + rh->log->type->set_region_sync(rh->log, reg->key, success); + dispatch_bios(rh->ms, ®->delayed_bios); + if (atomic_dec_and_test(&rh->recovery_in_flight)) + wake_up_all(&_kmirrord_recovery_stopped); + up(&rh->recovery_count); +} + static void rh_update_states(struct region_hash *rh) { struct region *reg, *next; @@ -380,9 +394,7 @@ static void rh_update_states(struct region_hash *rh) */ list_for_each_entry_safe (reg, next, &recovered, list) { rh->log->type->clear_region(rh->log, reg->key); - rh->log->type->complete_resync_work(rh->log, reg->key, 1); - dispatch_bios(rh->ms, ®->delayed_bios); - up(&rh->recovery_count); + complete_resync_work(reg, 1); mempool_free(reg, rh->region_pool); } @@ -502,11 +514,21 @@ static int __rh_recovery_prepare(struct region_hash *rh) static void rh_recovery_prepare(struct region_hash *rh) { - while (!down_trylock(&rh->recovery_count)) + /* Extra reference to avoid race with rh_stop_recovery */ + atomic_inc(&rh->recovery_in_flight); + + while (!down_trylock(&rh->recovery_count)) { + atomic_inc(&rh->recovery_in_flight); if (__rh_recovery_prepare(rh) <= 0) { + atomic_dec(&rh->recovery_in_flight); up(&rh->recovery_count); break; } + } + + /* Drop the extra reference */ + if (atomic_dec_and_test(&rh->recovery_in_flight)) + wake_up_all(&_kmirrord_recovery_stopped); } /* @@ -868,7 +890,7 @@ static void do_mirror(struct mirror_set *ms) do_writes(ms, &writes); } -static void do_work(void *ignored) +static void do_work(struct work_struct *ignored) { struct mirror_set *ms; @@ -1122,7 +1144,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, if (rw == WRITE) { queue_bio(ms, bio, rw); - return 0; + return DM_MAPIO_SUBMITTED; } r = ms->rh.log->type->in_sync(ms->rh.log, @@ -1131,7 +1153,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, return r; if (r == -EWOULDBLOCK) /* FIXME: ugly */ - r = 0; + r = DM_MAPIO_SUBMITTED; /* * We don't want to fast track a recovery just for a read @@ -1144,7 +1166,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, if (!r) { /* Pass this io over to the daemon */ queue_bio(ms, bio, rw); - return 0; + return DM_MAPIO_SUBMITTED; } m = choose_mirror(ms, bio->bi_sector); @@ -1152,7 +1174,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, return -EIO; map_bio(ms, m, bio); - return 1; + return DM_MAPIO_REMAPPED; } static int mirror_end_io(struct dm_target *ti, struct bio *bio, @@ -1177,6 +1199,11 @@ static void mirror_postsuspend(struct dm_target *ti) struct dirty_log *log = ms->rh.log; rh_stop_recovery(&ms->rh); + + /* Wait for all I/O we generated to complete */ + wait_event(_kmirrord_recovery_stopped, + !atomic_read(&ms->rh.recovery_in_flight)); + if (log->type->suspend && log->type->suspend(log)) /* FIXME: need better error handling */ DMWARN("log suspend failed"); @@ -1249,7 +1276,7 @@ static int __init dm_mirror_init(void) dm_dirty_log_exit(); return r; } - INIT_WORK(&_kmirrord_work, do_work, NULL); + INIT_WORK(&_kmirrord_work, do_work); r = dm_register_target(&mirror_target); if (r < 0) { diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index c5a16c5..a348a97 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -21,7 +21,7 @@ *---------------------------------------------------------------*/ struct path_info { struct list_head list; - struct path *path; + struct dm_path *path; unsigned repeat_count; }; @@ -80,7 +80,7 @@ static void rr_destroy(struct path_selector *ps) ps->context = NULL; } -static int rr_status(struct path_selector *ps, struct path *path, +static int rr_status(struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen) { struct path_info *pi; @@ -106,7 +106,7 @@ static int rr_status(struct path_selector *ps, struct path *path, * Called during initialisation to register each path with an * optional repeat_count. */ -static int rr_add_path(struct path_selector *ps, struct path *path, +static int rr_add_path(struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error) { struct selector *s = (struct selector *) ps->context; @@ -136,12 +136,12 @@ static int rr_add_path(struct path_selector *ps, struct path *path, path->pscontext = pi; - list_add(&pi->list, &s->valid_paths); + list_add_tail(&pi->list, &s->valid_paths); return 0; } -static void rr_fail_path(struct path_selector *ps, struct path *p) +static void rr_fail_path(struct path_selector *ps, struct dm_path *p) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = p->pscontext; @@ -149,7 +149,7 @@ static void rr_fail_path(struct path_selector *ps, struct path *p) list_move(&pi->list, &s->invalid_paths); } -static int rr_reinstate_path(struct path_selector *ps, struct path *p) +static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = p->pscontext; @@ -159,7 +159,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct path *p) return 0; } -static struct path *rr_select_path(struct path_selector *ps, +static struct dm_path *rr_select_path(struct path_selector *ps, unsigned *repeat_count) { struct selector *s = (struct selector *) ps->context; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5281e00..0821a2b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -39,8 +39,8 @@ */ #define SNAPSHOT_PAGES 256 -struct workqueue_struct *ksnapd; -static void flush_queued_bios(void *data); +static struct workqueue_struct *ksnapd; +static void flush_queued_bios(struct work_struct *work); struct pending_exception { struct exception e; @@ -88,8 +88,8 @@ struct pending_exception { * Hash table mapping origin volumes to lists of snapshots and * a lock to protect it */ -static kmem_cache_t *exception_cache; -static kmem_cache_t *pending_cache; +static struct kmem_cache *exception_cache; +static struct kmem_cache *pending_cache; static mempool_t *pending_pool; /* @@ -228,7 +228,7 @@ static int init_exception_table(struct exception_table *et, uint32_t size) return 0; } -static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) { struct list_head *slot; struct exception *ex, *next; @@ -528,7 +528,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios, s); + INIT_WORK(&s->queued_bios_work, flush_queued_bios); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ @@ -564,6 +564,17 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) return r; } +static void __free_exceptions(struct dm_snapshot *s) +{ + kcopyd_client_destroy(s->kcopyd_client); + s->kcopyd_client = NULL; + + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + s->store.destroy(&s->store); +} + static void snapshot_dtr(struct dm_target *ti) { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; @@ -574,13 +585,7 @@ static void snapshot_dtr(struct dm_target *ti) /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); - kcopyd_client_destroy(s->kcopyd_client); - - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); - - /* Deallocate memory used */ - s->store.destroy(&s->store); + __free_exceptions(s); dm_put_device(ti, s->origin); dm_put_device(ti, s->cow); @@ -603,9 +608,10 @@ static void flush_bios(struct bio *bio) } } -static void flush_queued_bios(void *data) +static void flush_queued_bios(struct work_struct *work) { - struct dm_snapshot *s = (struct dm_snapshot *) data; + struct dm_snapshot *s = + container_of(work, struct dm_snapshot, queued_bios_work); struct bio *queued_bios; unsigned long flags; @@ -867,7 +873,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, { struct exception *e; struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - int r = 1; + int r = DM_MAPIO_REMAPPED; chunk_t chunk; struct pending_exception *pe = NULL; @@ -913,7 +919,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, remap_exception(s, &pe->e, bio); bio_list_add(&pe->snapshot_bios, bio); - r = 0; + r = DM_MAPIO_SUBMITTED; if (!pe->started) { /* this is protected by snap->lock */ @@ -991,7 +997,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, *---------------------------------------------------------------*/ static int __origin_write(struct list_head *snapshots, struct bio *bio) { - int r = 1, first = 0; + int r = DM_MAPIO_REMAPPED, first = 0; struct dm_snapshot *snap; struct exception *e; struct pending_exception *pe, *next_pe, *primary_pe = NULL; @@ -1049,7 +1055,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) bio_list_add(&primary_pe->origin_bios, bio); - r = 0; + r = DM_MAPIO_SUBMITTED; } if (!pe->primary_pe) { @@ -1098,7 +1104,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) static int do_origin(struct dm_dev *origin, struct bio *bio) { struct origin *o; - int r = 1; + int r = DM_MAPIO_REMAPPED; down_read(&_origins_lock); o = __lookup_origin(origin->bdev); @@ -1155,7 +1161,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio, return -EOPNOTSUPP; /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1; + return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; } #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 6c29fce..51f5e07 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -186,7 +186,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = sc->stripe[stripe].dev->bdev; bio->bi_sector = sc->stripe[stripe].physical_start + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); - return 1; + return DM_MAPIO_REMAPPED; } static int stripe_status(struct dm_target *ti, diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index ea569f7..f314d7d 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -46,7 +46,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio, bio_endio(bio, bio->bi_size, 0); /* accepted bio, don't make new request */ - return 0; + return DM_MAPIO_SUBMITTED; } static struct target_type zero_target = { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b5764a8..fe7c56e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -68,10 +68,12 @@ union map_info *dm_get_mapinfo(struct bio *bio) #define DMF_FROZEN 2 #define DMF_FREEING 3 #define DMF_DELETING 4 +#define DMF_NOFLUSH_SUSPENDING 5 struct mapped_device { struct rw_semaphore io_lock; struct semaphore suspend_lock; + spinlock_t pushback_lock; rwlock_t map_lock; atomic_t holders; atomic_t open_count; @@ -89,7 +91,8 @@ struct mapped_device { */ atomic_t pending; wait_queue_head_t wait; - struct bio_list deferred; + struct bio_list deferred; + struct bio_list pushback; /* * The current mapping. @@ -121,8 +124,8 @@ struct mapped_device { }; #define MIN_IOS 256 -static kmem_cache_t *_io_cache; -static kmem_cache_t *_tio_cache; +static struct kmem_cache *_io_cache; +static struct kmem_cache *_tio_cache; static int __init local_init(void) { @@ -444,23 +447,50 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) * you this clearly demarcated crap. *---------------------------------------------------------------*/ +static int __noflush_suspending(struct mapped_device *md) +{ + return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); +} + /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ static void dec_pending(struct dm_io *io, int error) { - if (error) + unsigned long flags; + + /* Push-back supersedes any I/O errors */ + if (error && !(io->error > 0 && __noflush_suspending(io->md))) io->error = error; if (atomic_dec_and_test(&io->io_count)) { + if (io->error == DM_ENDIO_REQUEUE) { + /* + * Target requested pushing back the I/O. + * This must be handled before the sleeper on + * suspend queue merges the pushback list. + */ + spin_lock_irqsave(&io->md->pushback_lock, flags); + if (__noflush_suspending(io->md)) + bio_list_add(&io->md->pushback, io->bio); + else + /* noflush suspend was interrupted. */ + io->error = -EIO; + spin_unlock_irqrestore(&io->md->pushback_lock, flags); + } + if (end_io_acct(io)) /* nudge anyone waiting on suspend queue */ wake_up(&io->md->wait); - blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE); + if (io->error != DM_ENDIO_REQUEUE) { + blk_add_trace_bio(io->md->queue, io->bio, + BLK_TA_COMPLETE); + + bio_endio(io->bio, io->bio->bi_size, io->error); + } - bio_endio(io->bio, io->bio->bi_size, io->error); free_io(io->md, io); } } @@ -480,12 +510,19 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) if (endio) { r = endio(tio->ti, bio, error, &tio->info); - if (r < 0) + if (r < 0 || r == DM_ENDIO_REQUEUE) + /* + * error and requeue request are handled + * in dec_pending(). + */ error = r; - - else if (r > 0) - /* the target wants another shot at the io */ + else if (r == DM_ENDIO_INCOMPLETE) + /* The target will handle the io */ return 1; + else if (r) { + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } } dec_pending(tio->io, error); @@ -543,7 +580,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, atomic_inc(&tio->io->io_count); sector = clone->bi_sector; r = ti->type->map(ti, clone, &tio->info); - if (r > 0) { + if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, @@ -551,10 +588,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, clone->bi_sector); generic_make_request(clone); - } - - else if (r < 0) { - /* error the io and bail out */ + } else if (r < 0 || r == DM_MAPIO_REQUEUE) { + /* error the io and bail out, or requeue it if needed */ md = tio->io->md; dec_pending(tio->io, r); /* @@ -563,6 +598,9 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, clone->bi_private = md->bs; bio_put(clone); free_tio(md, tio); + } else if (r) { + DMWARN("unimplemented target map return value: %d", r); + BUG(); } } @@ -948,6 +986,7 @@ static struct mapped_device *alloc_dev(int minor) memset(md, 0, sizeof(*md)); init_rwsem(&md->io_lock); init_MUTEX(&md->suspend_lock); + spin_lock_init(&md->pushback_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -966,8 +1005,8 @@ static struct mapped_device *alloc_dev(int minor) md->queue->issue_flush_fn = dm_flush_all; md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); - if (!md->io_pool) - goto bad2; + if (!md->io_pool) + goto bad2; md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); if (!md->tio_pool) @@ -1275,20 +1314,30 @@ static void unlock_fs(struct mapped_device *md) * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ -int dm_suspend(struct mapped_device *md, int do_lockfs) +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; + unsigned long flags; DECLARE_WAITQUEUE(wait, current); struct bio *def; int r = -EINVAL; + int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; + int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; down(&md->suspend_lock); if (dm_suspended(md)) - goto out; + goto out_unlock; map = dm_get_table(md); + /* + * DMF_NOFLUSH_SUSPENDING must be set before presuspend. + * This flag is cleared before dm_suspend returns. + */ + if (noflush) + set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + /* This does not get reverted if there's an error later. */ dm_table_presuspend_targets(map); @@ -1296,11 +1345,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) if (!md->suspended_bdev) { DMWARN("bdget failed in dm_suspend"); r = -ENOMEM; - goto out; + goto flush_and_out; } - /* Flush I/O to the device. */ - if (do_lockfs) { + /* + * Flush I/O to the device. + * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. + */ + if (do_lockfs && !noflush) { r = lock_fs(md); if (r) goto out; @@ -1336,6 +1388,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) down_write(&md->io_lock); remove_wait_queue(&md->wait, &wait); + if (noflush) { + spin_lock_irqsave(&md->pushback_lock, flags); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + bio_list_merge_head(&md->deferred, &md->pushback); + bio_list_init(&md->pushback); + spin_unlock_irqrestore(&md->pushback_lock, flags); + } + /* were we interrupted ? */ r = -EINTR; if (atomic_read(&md->pending)) { @@ -1344,7 +1404,7 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) __flush_deferred_io(md, def); up_write(&md->io_lock); unlock_fs(md); - goto out; + goto out; /* pushback list is already flushed, so skip flush */ } up_write(&md->io_lock); @@ -1354,6 +1414,25 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) r = 0; +flush_and_out: + if (r && noflush) { + /* + * Because there may be already I/Os in the pushback list, + * flush them before return. + */ + down_write(&md->io_lock); + + spin_lock_irqsave(&md->pushback_lock, flags); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + bio_list_merge_head(&md->deferred, &md->pushback); + bio_list_init(&md->pushback); + spin_unlock_irqrestore(&md->pushback_lock, flags); + + def = bio_list_get(&md->deferred); + __flush_deferred_io(md, def); + up_write(&md->io_lock); + } + out: if (r && md->suspended_bdev) { bdput(md->suspended_bdev); @@ -1361,6 +1440,8 @@ out: } dm_table_put(map); + +out_unlock: up(&md->suspend_lock); return r; } @@ -1438,6 +1519,17 @@ int dm_suspended(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_noflush_suspending(struct dm_target *ti) +{ + struct mapped_device *md = dm_table_get_md(ti->table); + int r = __noflush_suspending(md); + + dm_put(md); + + return r; +} +EXPORT_SYMBOL_GPL(dm_noflush_suspending); + static struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a48ec5e..2f796b1 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -33,6 +33,25 @@ #define SECTOR_SHIFT 9 /* + * Definitions of return values from target end_io function. + */ +#define DM_ENDIO_INCOMPLETE 1 +#define DM_ENDIO_REQUEUE 2 + +/* + * Definitions of return values from target map function. + */ +#define DM_MAPIO_SUBMITTED 0 +#define DM_MAPIO_REMAPPED 1 +#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE + +/* + * Suspend feature flags + */ +#define DM_SUSPEND_LOCKFS_FLAG (1 << 0) +#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) + +/* * List of devices that a metadevice uses and should open/close. */ struct dm_dev { diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index f1db6ef..b46f6c5 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -203,7 +203,7 @@ struct kcopyd_job { /* FIXME: this should scale with the number of pages */ #define MIN_JOBS 512 -static kmem_cache_t *_job_cache; +static struct kmem_cache *_job_cache; static mempool_t *_job_pool; /* @@ -417,7 +417,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) /* * kcopyd does this every time it's woken up. */ -static void do_work(void *ignored) +static void do_work(struct work_struct *ignored) { /* * The order that these are called is *very* important. @@ -628,7 +628,7 @@ static int kcopyd_init(void) } kcopyd_clients++; - INIT_WORK(&_kcopyd_work, do_work, NULL); + INIT_WORK(&_kcopyd_work, do_work); mutex_unlock(&kcopyd_init_lock); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 57fa64f..21e2a7b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -39,10 +39,10 @@ #include <linux/raid/bitmap.h> #include <linux/sysctl.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ -#include <linux/suspend.h> #include <linux/poll.h> #include <linux/mutex.h> #include <linux/ctype.h> +#include <linux/freezer.h> #include <linux/init.h> @@ -974,12 +974,13 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) * version 1 superblock */ -static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) { - unsigned int disk_csum, csum; + __le32 disk_csum; + u32 csum; unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; - unsigned int *isuper = (unsigned int*)sb; + __le32 *isuper = (__le32*)sb; int i; disk_csum = sb->sb_csum; @@ -989,7 +990,7 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) newcsum += le32_to_cpu(*isuper++); if (size == 2) - newcsum += le16_to_cpu(*(unsigned short*) isuper); + newcsum += le16_to_cpu(*(__le16*) isuper); csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; @@ -1106,7 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (le32_to_cpu(sb->chunksize)) rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); - if (le32_to_cpu(sb->size) > rdev->size*2) + if (le64_to_cpu(sb->size) > rdev->size*2) return -EINVAL; return ret; } @@ -1228,7 +1229,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) else sb->resync_offset = cpu_to_le64(0); - sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); + sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->size<<1); @@ -1412,7 +1413,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); @@ -1422,7 +1423,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) if (err) { printk(KERN_ERR "md: could not bd_claim %s.\n", bdevname(bdev, b)); - blkdev_put_partition(bdev); + blkdev_put(bdev); return err; } rdev->bdev = bdev; @@ -1436,7 +1437,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) if (!bdev) MD_BUG(); bd_release(bdev); - blkdev_put_partition(bdev); + blkdev_put(bdev); } void md_autodetect_dev(dev_t dev); @@ -2002,6 +2003,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj); rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; rdev->sb_events = 0; @@ -3198,6 +3200,7 @@ static int do_md_run(mddev_t * mddev) mddev->changed = 1; md_new_event(mddev); + kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); return 0; } @@ -3311,6 +3314,10 @@ static int do_md_stop(mddev_t * mddev, int mode) module_put(mddev->pers->owner); mddev->pers = NULL; + + set_capacity(disk, 0); + mddev->changed = 1; + if (mddev->ro) mddev->ro = 0; } @@ -3330,7 +3337,7 @@ static int do_md_stop(mddev_t * mddev, int mode) if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; - struct gendisk *disk; + printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); @@ -3355,10 +3362,6 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->raid_disks = 0; mddev->recovery_cp = 0; - disk = mddev->gendisk; - if (disk) - set_capacity(disk, 0); - mddev->changed = 1; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); @@ -3368,6 +3371,7 @@ out: return err; } +#ifndef MODULE static void autorun_array(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -3482,6 +3486,7 @@ static void autorun_devices(int part) } printk(KERN_INFO "md: ... autorun DONE.\n"); } +#endif /* !MODULE */ static int get_version(void __user * arg) { @@ -3719,6 +3724,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); + md_update_sb(mddev, 1); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); return err; @@ -4043,11 +4049,8 @@ static int update_size(mddev_t *mddev, unsigned long size) return -EBUSY; ITERATE_RDEV(mddev,rdev,tmp) { sector_t avail; - if (rdev->sb_offset > rdev->data_offset) - avail = (rdev->sb_offset*2) - rdev->data_offset; - else - avail = get_capacity(rdev->bdev->bd_disk) - - rdev->data_offset; + avail = rdev->size * 2; + if (fit && (size == 0 || size > avail/2)) size = avail/2; if (avail < ((sector_t)size << 1)) @@ -4423,7 +4426,7 @@ static int md_open(struct inode *inode, struct file *file) mddev_t *mddev = inode->i_bdev->bd_disk->private_data; int err; - if ((err = mddev_lock(mddev))) + if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) goto out; err = 0; @@ -4486,6 +4489,7 @@ static int md_thread(void * arg) * many dirty RAID5 blocks. */ + current->flags |= PF_NOFREEZE; allow_signal(SIGKILL); while (!kthread_should_stop()) { @@ -4502,7 +4506,6 @@ static int md_thread(void * arg) test_bit(THREAD_WAKEUP, &thread->flags) || kthread_should_stop(), thread->timeout); - try_to_freeze(); clear_bit(THREAD_WAKEUP, &thread->flags); @@ -4846,8 +4849,8 @@ static int md_seq_show(struct seq_file *seq, void *v) chunk_kb ? "KB" : "B"); if (bitmap->file) { seq_printf(seq, ", file: "); - seq_path(seq, bitmap->file->f_vfsmnt, - bitmap->file->f_dentry," \t\n"); + seq_path(seq, bitmap->file->f_path.mnt, + bitmap->file->f_path.dentry," \t\n"); } seq_printf(seq, "\n"); @@ -4912,6 +4915,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait) } static struct file_operations md_seq_fops = { + .owner = THIS_MODULE, .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -5272,7 +5276,6 @@ void md_do_sync(mddev_t *mddev) mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -5296,6 +5299,7 @@ void md_do_sync(mddev_t *mddev) rdev->recovery_offset = mddev->curr_resync; } } + set_bit(MD_CHANGE_DEVS, &mddev->flags); skip: mddev->curr_resync = 0; @@ -5592,7 +5596,7 @@ static void autostart_arrays(int part) autorun_devices(part); } -#endif +#endif /* !MODULE */ static __exit void md_exit(void) { diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 171ff41..14da37f 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -277,6 +277,7 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); conf->working_disks--; + mddev->degraded++; printk(KERN_ALERT "multipath: IO failure on %s," " disabling IO path. \n Operation continuing" " on %d IO paths.\n", @@ -336,6 +337,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; + mddev->degraded--; rdev->raid_disk = path; set_bit(In_sync, &rdev->flags); rcu_assign_pointer(p->rdev, rdev); @@ -501,7 +503,7 @@ static int multipath_run (mddev_t *mddev) mdname(mddev)); goto out_free_conf; } - mddev->degraded = conf->raid_disks = conf->working_disks; + mddev->degraded = conf->raid_disks - conf->working_disks; conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dc9d2de..b3c5e12 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1474,8 +1474,8 @@ static void fix_read_error(conf_t *conf, int read_disk, "raid1:%s: read error corrected " "(%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)sect + - rdev->data_offset, + (unsigned long long)(sect + + rdev->data_offset), bdevname(rdev->bdev, b)); } } @@ -1951,6 +1951,7 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + conf->fullsync = 1; } } if (mddev->degraded == conf->raid_disks) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1250f0e..7492d60 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1470,8 +1470,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)sect+ - rdev->data_offset, + (unsigned long long)(sect+ + rdev->data_offset), bdevname(rdev->bdev, b)); rdev_dec_pending(rdev, mddev); @@ -2079,7 +2079,7 @@ static int run(mddev_t *mddev) disk = conf->mirrors + i; if (!disk->rdev || - !test_bit(In_sync, &rdev->flags)) { + !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e14f457..377f8bc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (!test_bit(STRIPE_EXPANDING, &sh->state)) { list_add_tail(&sh->lru, &conf->inactive_list); wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); } } } @@ -348,7 +350,7 @@ static int grow_one_stripe(raid5_conf_t *conf) static int grow_stripes(raid5_conf_t *conf, int num) { - kmem_cache_t *sc; + struct kmem_cache *sc; int devs = conf->raid_disks; sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev)); @@ -397,7 +399,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) LIST_HEAD(newstripes); struct disk_info *ndisks; int err = 0; - kmem_cache_t *sc; + struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) @@ -542,35 +544,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, } if (uptodate) { -#if 0 - struct bio *bio; - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - /* we can return a buffer if we bypassed the cache or - * if the top buffer is not in highmem. If there are - * multiple buffers, leave the extra work to - * handle_stripe - */ - buffer = sh->bh_read[i]; - if (buffer && - (!PageHighMem(buffer->b_page) - || buffer->b_page == bh->b_page ) - ) { - sh->bh_read[i] = buffer->b_reqnext; - buffer->b_reqnext = NULL; - } else - buffer = NULL; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (sh->bh_page[i]==bh->b_page) - set_buffer_uptodate(bh); - if (buffer) { - if (buffer->b_page != bh->b_page) - memcpy(buffer->b_data, bh->b_data, bh->b_size); - buffer->b_end_io(buffer, 1); - } -#else set_bit(R5_UPTODATE, &sh->dev[i].flags); -#endif if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", @@ -616,14 +590,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, } } rdev_dec_pending(conf->disks[i].rdev, conf->mddev); -#if 0 - /* must restore b_page before unlocking buffer... */ - if (sh->bh_page[i] != bh->b_page) { - bh->b_page = sh->bh_page[i]; - bh->b_data = page_address(bh->b_page); - clear_buffer_uptodate(bh); - } -#endif clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -821,7 +787,8 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, static sector_t compute_blocknr(struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; - int raid_disks = sh->disks, data_disks = raid_disks - 1; + int raid_disks = sh->disks; + int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; @@ -857,7 +824,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) } break; case 6: - data_disks = raid_disks - 2; if (i == raid6_next_disk(sh->pd_idx, raid_disks)) return 0; /* It is the Q disk */ switch (conf->algorithm) { @@ -1353,8 +1319,10 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) int pd_idx, dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); - raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk - + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); + raid5_compute_sector(stripe * (disks - conf->max_degraded) + *sectors_per_chunk + chunk_offset, + disks, disks - conf->max_degraded, + &dd_idx, &pd_idx, conf); return pd_idx; } @@ -1615,15 +1583,6 @@ static void handle_stripe5(struct stripe_head *sh) } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); -#if 0 - /* if I am just reading this block and we don't have - a failed drive, or any pending writes then sidestep the cache */ - if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && - ! syncing && !failed && !to_write) { - sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; - sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; - } -#endif locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); @@ -1641,9 +1600,6 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 -|| sh->bh_page[i]!=bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags) @@ -1655,9 +1611,6 @@ static void handle_stripe5(struct stripe_head *sh) /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 -|| sh->bh_page[i] != bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -1865,7 +1818,9 @@ static void handle_stripe5(struct stripe_head *sh) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2193,15 +2148,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); -#if 0 - /* if I am just reading this block and we don't have - a failed drive, or any pending writes then sidestep the cache */ - if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && - ! syncing && !failed && !to_write) { - sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; - sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; - } -#endif locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); @@ -2220,9 +2166,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (!test_bit(R5_OVERWRITE, &dev->flags) && i != pd_idx && i != qd_idx && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 - || sh->bh_page[i] != bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -2418,7 +2361,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2611,6 +2556,180 @@ static int raid5_congested(void *data, int bits) return 0; } +/* We want read requests to align with chunks where possible, + * but write requests don't need to. + */ +static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) +{ + mddev_t *mddev = q->queuedata; + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + if (bio_data_dir(bio)) + return biovec->bv_len; /* always allow writes to be mergeable */ + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (max < 0) max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + else + return max; +} + + +static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) +{ + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + return chunk_sectors >= + ((sector & (chunk_sectors - 1)) + bio_sectors); +} + +/* + * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) + * later sampled by raid5d. + */ +static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) +{ + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + bi->bi_next = conf->retry_read_aligned_list; + conf->retry_read_aligned_list = bi; + + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(conf->mddev->thread); +} + + +static struct bio *remove_bio_from_retry(raid5_conf_t *conf) +{ + struct bio *bi; + + bi = conf->retry_read_aligned; + if (bi) { + conf->retry_read_aligned = NULL; + return bi; + } + bi = conf->retry_read_aligned_list; + if(bi) { + conf->retry_read_aligned = bi->bi_next; + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* biased count of active stripes */ + bi->bi_hw_segments = 0; /* count of processed stripes */ + } + + return bi; +} + + +/* + * The "raid5_align_endio" should check if the read succeeded and if it + * did, call bio_endio on the original bio (having bio_put the new bio + * first). + * If the read failed.. + */ +static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) +{ + struct bio* raid_bi = bi->bi_private; + mddev_t *mddev; + raid5_conf_t *conf; + int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + mdk_rdev_t *rdev; + + if (bi->bi_size) + return 1; + bio_put(bi); + + mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; + conf = mddev_to_conf(mddev); + rdev = (void*)raid_bi->bi_next; + raid_bi->bi_next = NULL; + + rdev_dec_pending(rdev, conf->mddev); + + if (!error && uptodate) { + bio_endio(raid_bi, bytes, 0); + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return 0; + } + + + PRINTK("raid5_align_endio : io error...handing IO for a retry\n"); + + add_bio_to_retry(raid_bi, conf); + return 0; +} + +static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) +{ + mddev_t *mddev = q->queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + const unsigned int raid_disks = conf->raid_disks; + const unsigned int data_disks = raid_disks - conf->max_degraded; + unsigned int dd_idx, pd_idx; + struct bio* align_bi; + mdk_rdev_t *rdev; + + if (!in_chunk_boundary(mddev, raid_bio)) { + printk("chunk_aligned_read : non aligned\n"); + return 0; + } + /* + * use bio_clone to make a copy of the bio + */ + align_bi = bio_clone(raid_bio, GFP_NOIO); + if (!align_bi) + return 0; + /* + * set bi_end_io to a new function, and set bi_private to the + * original bio. + */ + align_bi->bi_end_io = raid5_align_endio; + align_bi->bi_private = raid_bio; + /* + * compute position + */ + align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, + raid_disks, + data_disks, + &dd_idx, + &pd_idx, + conf); + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[dd_idx].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + raid_bio->bi_next = (void*)rdev; + align_bi->bi_bdev = rdev->bdev; + align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); + align_bi->bi_sector += rdev->data_offset; + + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + + generic_make_request(align_bi); + return 1; + } else { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } +} + + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; @@ -2632,6 +2751,11 @@ static int make_request(request_queue_t *q, struct bio * bi) disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); + if (bio_data_dir(bi) == READ && + mddev->reshape_position == MaxSector && + chunk_aligned_read(q,bi)) + return 0; + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -2739,7 +2863,9 @@ static int make_request(request_queue_t *q, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } return 0; } @@ -2950,6 +3076,74 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return STRIPE_SECTORS; } +static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) +{ + /* We may not be able to submit a whole bio at once as there + * may not be enough stripe_heads available. + * We cannot pre-allocate enough stripe_heads as we may need + * more than exist in the cache (if we allow ever large chunks). + * So we do one stripe head at a time and record in + * ->bi_hw_segments how many have been done. + * + * We *know* that this entire raid_bio is in one chunk, so + * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. + */ + struct stripe_head *sh; + int dd_idx, pd_idx; + sector_t sector, logical_sector, last_sector; + int scnt = 0; + int remaining; + int handled = 0; + + logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + sector = raid5_compute_sector( logical_sector, + conf->raid_disks, + conf->raid_disks - conf->max_degraded, + &dd_idx, + &pd_idx, + conf); + last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS, scnt++) { + + if (scnt < raid_bio->bi_hw_segments) + /* already done this stripe */ + continue; + + sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + + if (!sh) { + /* failed to get a stripe - must wait */ + raid_bio->bi_hw_segments = scnt; + conf->retry_read_aligned = raid_bio; + return handled; + } + + set_bit(R5_ReadError, &sh->dev[dd_idx].flags); + add_stripe_bio(sh, raid_bio, dd_idx, 0); + handle_stripe(sh, NULL); + release_stripe(sh); + handled++; + } + spin_lock_irq(&conf->device_lock); + remaining = --raid_bio->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); + if (remaining == 0) { + int bytes = raid_bio->bi_size; + + raid_bio->bi_size = 0; + raid_bio->bi_end_io(raid_bio, bytes, + test_bit(BIO_UPTODATE, &raid_bio->bi_flags) + ? 0 : -EIO); + } + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return handled; +} + + + /* * This is our raid5 kernel thread. * @@ -2971,6 +3165,7 @@ static void raid5d (mddev_t *mddev) spin_lock_irq(&conf->device_lock); while (1) { struct list_head *first; + struct bio *bio; if (conf->seq_flush != conf->seq_write) { int seq = conf->seq_flush; @@ -2987,6 +3182,16 @@ static void raid5d (mddev_t *mddev) !list_empty(&conf->delayed_list)) raid5_activate_delayed(conf); + while ((bio = remove_bio_from_retry(conf))) { + int ok; + spin_unlock_irq(&conf->device_lock); + ok = retry_aligned_read(conf, bio); + spin_lock_irq(&conf->device_lock); + if (!ok) + break; + handled++; + } + if (list_empty(&conf->handle_list)) break; @@ -3174,6 +3379,7 @@ static int run(mddev_t *mddev) INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); PRINTK("raid5: run(%s) called.\n", mdname(mddev)); @@ -3320,6 +3526,8 @@ static int run(mddev_t *mddev) mddev->array_size = mddev->size * (conf->previous_raid_disks - conf->max_degraded); + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + return 0; abort: if (conf) { @@ -3659,7 +3867,7 @@ static void end_reshape(raid5_conf_t *conf) bdev = bdget_disk(conf->mddev->gendisk, 0); if (bdev) { mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); } @@ -3694,7 +3902,8 @@ static void raid5_quiesce(mddev_t *mddev, int state) spin_lock_irq(&conf->device_lock); conf->quiesce = 1; wait_event_lock_irq(conf->wait_for_stripe, - atomic_read(&conf->active_stripes) == 0, + atomic_read(&conf->active_stripes) == 0 && + atomic_read(&conf->active_aligned_reads) == 0, conf->device_lock, /* nothing */); spin_unlock_irq(&conf->device_lock); break; |