diff options
author | Scott Wood <scottwood@freescale.com> | 2015-02-13 22:12:06 (GMT) |
---|---|---|
committer | Scott Wood <scottwood@freescale.com> | 2015-02-13 22:19:22 (GMT) |
commit | 6faa2909871d8937cb2f79a10e1b21ffe193fac1 (patch) | |
tree | f558a94f1553814cc122ab8d9e04c0ebad5262a5 /drivers/md | |
parent | fcb2fb84301c673ee15ca04e7a2fc965712d49a0 (diff) | |
download | linux-fsl-qoriq-6faa2909871d8937cb2f79a10e1b21ffe193fac1.tar.xz |
Reset to 3.12.37
Diffstat (limited to 'drivers/md')
29 files changed, 487 insertions, 314 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 5eb76dd..f950c9d 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,7 +1,6 @@ config BCACHE tristate "Block device as cache" - depends on !PREEMPT_RT_FULL ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0f12382..7552207 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -663,9 +663,13 @@ struct gc_stat { * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. * flushing dirty data). + * + * CACHE_SET_RUNNING means all cache devices have been registered and journal + * replay is complete. */ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 +#define CACHE_SET_RUNNING 2 struct cache_set { struct closure cl; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index d1734d9..26ca4db 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -141,7 +141,7 @@ static void bch_btree_node_read_done(struct btree *b) struct bset *i = b->sets[0].data; struct btree_iter *iter; - iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter = mempool_alloc(b->c->fill_iter, GFP_NOIO); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8435f81..c494379 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -42,11 +42,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, int ret = 0; sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); - pr_debug("reading %llu", (uint64_t) bucket); + pr_debug("reading %u", bucket_index); while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; - len = min_t(unsigned, left, PAGE_SECTORS * 8); + len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); bio_reset(bio); bio->bi_sector = bucket + offset; @@ -72,17 +72,26 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(ca->set)) + if (j->magic != jset_magic(ca->set)) { + pr_debug("%u: bad magic", bucket_index); return ret; + } - if (bytes > left << 9) + if (bytes > left << 9 || + bytes > PAGE_SIZE << JSET_BITS) { + pr_info("%u: too big, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } if (bytes > len << 9) goto reread; - if (j->csum != csum_set(j)) + if (j->csum != csum_set(j)) { + pr_info("%u: bad csum, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } blocks = set_blocks(j, ca->set); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 547c4c5..f5004c5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1235,6 +1235,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) if (test_bit(CACHE_SET_STOPPING, &c->flags)) return -EINTR; + if (!test_bit(CACHE_SET_RUNNING, &c->flags)) + return -EPERM; + u = uuid_find_empty(c); if (!u) { pr_err("Can't create volume, no room for UUID"); @@ -1300,8 +1303,11 @@ static void cache_set_free(struct closure *cl) bch_journal_free(c); for_each_cache(ca, c, i) - if (ca) + if (ca) { + ca->set = NULL; + c->cache[ca->sb.nr_this_dev] = NULL; kobject_put(&ca->kobj); + } free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); @@ -1637,6 +1643,7 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); + set_bit(CACHE_SET_RUNNING, &c->flags); return; err_unlock_gc: closure_set_stopped(&c->gc.cl); @@ -1722,8 +1729,10 @@ void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); - if (ca->set) + if (ca->set) { + BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); ca->set->cache[ca->sb.nr_this_dev] = NULL; + } bch_cache_allocator_exit(ca); @@ -1794,7 +1803,7 @@ err: } static void register_cache(struct cache_sb *sb, struct page *sb_page, - struct block_device *bdev, struct cache *ca) + struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ea345c6..4dd1e2c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -417,8 +417,8 @@ do { \ average_frequency, frequency_units); \ __print_time_stat(stats, name, \ average_duration, duration_units); \ - __print_time_stat(stats, name, \ - max_duration, duration_units); \ + sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ + div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\ \ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ ? div_s64(local_clock() - (stats)->last, \ diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 54bdd923..93edd89 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -463,6 +463,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty) c->n_buffers[dirty]++; b->list_mode = dirty; list_move(&b->lru_list, &c->lru[dirty]); + b->last_accessed = jiffies; } /*---------------------------------------------------------------- @@ -529,6 +530,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, end_io(&b->bio, r); } +static void inline_endio(struct bio *bio, int error) +{ + bio_end_io_t *end_fn = bio->bi_private; + + /* + * Reset the bio to free any attached resources + * (e.g. bio integrity profiles). + */ + bio_reset(bio); + + end_fn(bio, error); +} + static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, bio_end_io_t *end_io) { @@ -540,7 +554,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; b->bio.bi_sector = block << b->c->sectors_per_block_bits; b->bio.bi_bdev = b->c->bdev; - b->bio.bi_end_io = end_io; + b->bio.bi_end_io = inline_endio; + /* + * Use of .bi_private isn't a problem here because + * the dm_buffer's inline bio is local to bufio. + */ + b->bio.bi_private = end_io; /* * We assume that if len >= PAGE_SIZE ptr is page-aligned. @@ -1417,9 +1436,9 @@ static void drop_buffers(struct dm_bufio_client *c) /* * Test if the buffer is unused and too old, and commit it. - * At if noio is set, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to - * different bufio client. + * And if GFP_NOFS is used, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets + * rerouted to different bufio client. */ static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, unsigned long max_jiffies) @@ -1427,7 +1446,7 @@ static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, if (jiffies - b->last_accessed < max_jiffies) return 0; - if (!(gfp & __GFP_IO)) { + if (!(gfp & __GFP_FS)) { if (test_bit(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) @@ -1455,9 +1474,9 @@ static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { freed += __cleanup_old_buffer(b, gfp_mask, 0); if (!--nr_to_scan) - break; + return freed; + dm_bufio_cond_resched(); } - dm_bufio_cond_resched(); } return freed; } @@ -1469,7 +1488,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return SHRINK_STOP; @@ -1486,7 +1505,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return 0; @@ -1511,7 +1530,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign BUG_ON(block_size < 1 << SECTOR_SHIFT || (block_size & (block_size - 1))); - c = kmalloc(sizeof(*c), GFP_KERNEL); + c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) { r = -ENOMEM; goto bad_client; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1af7255..0bfd9c0 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -88,6 +88,9 @@ struct cache_disk_superblock { } __packed; struct dm_cache_metadata { + atomic_t ref_count; + struct list_head list; + struct block_device *bdev; struct dm_block_manager *bm; struct dm_space_map *metadata_sm; @@ -114,6 +117,12 @@ struct dm_cache_metadata { unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; size_t policy_hint_size; struct dm_cache_statistics stats; + + /* + * Reading the space map root can fail, so we read it into this + * buffer before the superblock is locked and updated. + */ + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; }; /*------------------------------------------------------------------- @@ -242,11 +251,31 @@ static void __setup_mapping_info(struct dm_cache_metadata *cmd) } } +static int __save_sm_root(struct dm_cache_metadata *cmd) +{ + int r; + size_t metadata_len; + + r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); + if (r < 0) + return r; + + return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root, + metadata_len); +} + +static void __copy_sm_root(struct dm_cache_metadata *cmd, + struct cache_disk_superblock *disk_super) +{ + memcpy(&disk_super->metadata_space_map_root, + &cmd->metadata_space_map_root, + sizeof(cmd->metadata_space_map_root)); +} + static int __write_initial_superblock(struct dm_cache_metadata *cmd) { int r; struct dm_block *sblock; - size_t metadata_len; struct cache_disk_superblock *disk_super; sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT; @@ -254,12 +283,16 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS) bdev_size = DM_CACHE_METADATA_MAX_SECTORS; - r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); + r = dm_tm_pre_commit(cmd->tm); if (r < 0) return r; - r = dm_tm_pre_commit(cmd->tm); - if (r < 0) + /* + * dm_sm_copy_root() can fail. So we need to do it before we start + * updating the superblock. + */ + r = __save_sm_root(cmd); + if (r) return r; r = superblock_lock_zero(cmd, &sblock); @@ -275,10 +308,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; - r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, - metadata_len); - if (r < 0) - goto bad_locked; + __copy_sm_root(cmd, disk_super); disk_super->mapping_root = cpu_to_le64(cmd->root); disk_super->hint_root = cpu_to_le64(cmd->hint_root); @@ -295,10 +325,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->write_misses = cpu_to_le32(0); return dm_tm_commit(cmd->tm, sblock); - -bad_locked: - dm_bm_unlock(sblock); - return r; } static int __format_metadata(struct dm_cache_metadata *cmd) @@ -384,6 +410,15 @@ static int __open_metadata(struct dm_cache_metadata *cmd) disk_super = dm_block_data(sblock); + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk_super->data_block_size), + (unsigned long long)cmd->data_block_size); + r = -EINVAL; + goto bad; + } + r = __check_incompat_features(disk_super, cmd); if (r < 0) goto bad; @@ -511,8 +546,9 @@ static int __begin_transaction_flags(struct dm_cache_metadata *cmd, disk_super = dm_block_data(sblock); update_flags(disk_super, mutator); read_superblock_fields(cmd, disk_super); + dm_bm_unlock(sblock); - return dm_bm_flush_and_unlock(cmd->bm, sblock); + return dm_bm_flush(cmd->bm); } static int __begin_transaction(struct dm_cache_metadata *cmd) @@ -540,7 +576,6 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, flags_mutator mutator) { int r; - size_t metadata_len; struct cache_disk_superblock *disk_super; struct dm_block *sblock; @@ -558,8 +593,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, if (r < 0) return r; - r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); - if (r < 0) + r = __save_sm_root(cmd); + if (r) return r; r = superblock_lock(cmd, &sblock); @@ -586,13 +621,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits); disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses); - - r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, - metadata_len); - if (r < 0) { - dm_bm_unlock(sblock); - return r; - } + __copy_sm_root(cmd, disk_super); return dm_tm_commit(cmd->tm, sblock); } @@ -624,10 +653,10 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) /*----------------------------------------------------------------*/ -struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, - sector_t data_block_size, - bool may_format_device, - size_t policy_hint_size) +static struct dm_cache_metadata *metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) { int r; struct dm_cache_metadata *cmd; @@ -638,6 +667,7 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, return NULL; } + atomic_set(&cmd->ref_count, 1); init_rwsem(&cmd->root_lock); cmd->bdev = bdev; cmd->data_block_size = data_block_size; @@ -660,10 +690,95 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, return cmd; } +/* + * We keep a little list of ref counted metadata objects to prevent two + * different target instances creating separate bufio instances. This is + * an issue if a table is reloaded before the suspend. + */ +static DEFINE_MUTEX(table_lock); +static LIST_HEAD(table); + +static struct dm_cache_metadata *lookup(struct block_device *bdev) +{ + struct dm_cache_metadata *cmd; + + list_for_each_entry(cmd, &table, list) + if (cmd->bdev == bdev) { + atomic_inc(&cmd->ref_count); + return cmd; + } + + return NULL; +} + +static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) +{ + struct dm_cache_metadata *cmd, *cmd2; + + mutex_lock(&table_lock); + cmd = lookup(bdev); + mutex_unlock(&table_lock); + + if (cmd) + return cmd; + + cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size); + if (cmd) { + mutex_lock(&table_lock); + cmd2 = lookup(bdev); + if (cmd2) { + mutex_unlock(&table_lock); + __destroy_persistent_data_objects(cmd); + kfree(cmd); + return cmd2; + } + list_add(&cmd->list, &table); + mutex_unlock(&table_lock); + } + + return cmd; +} + +static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) +{ + if (cmd->data_block_size != data_block_size) { + DMERR("data_block_size (%llu) different from that in metadata (%llu)\n", + (unsigned long long) data_block_size, + (unsigned long long) cmd->data_block_size); + return false; + } + + return true; +} + +struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) +{ + struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, + may_format_device, policy_hint_size); + if (cmd && !same_params(cmd, data_block_size)) { + dm_cache_metadata_close(cmd); + return NULL; + } + + return cmd; +} + void dm_cache_metadata_close(struct dm_cache_metadata *cmd) { - __destroy_persistent_data_objects(cmd); - kfree(cmd); + if (atomic_dec_and_test(&cmd->ref_count)) { + mutex_lock(&table_lock); + list_del(&cmd->list); + mutex_unlock(&table_lock); + + __destroy_persistent_data_objects(cmd); + kfree(cmd); + } } int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 6ab68e0..1771845 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -146,7 +146,13 @@ struct cache { struct list_head need_commit_migrations; sector_t migration_threshold; wait_queue_head_t migration_wait; - atomic_t nr_migrations; + atomic_t nr_allocated_migrations; + + /* + * The number of in flight migrations that are performing + * background io. eg, promotion, writeback. + */ + atomic_t nr_io_migrations; wait_queue_head_t quiescing_wait; atomic_t quiescing_ack; @@ -154,7 +160,7 @@ struct cache { /* * cache_size entries, dirty if set */ - dm_cblock_t nr_dirty; + atomic_t nr_dirty; unsigned long *dirty_bitset; /* @@ -162,7 +168,7 @@ struct cache { */ dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; - uint32_t discard_block_size; /* a power of 2 times sectors per block */ + uint32_t discard_block_size; /* * Rather than reconstructing the table line for the status we just @@ -182,7 +188,6 @@ struct cache { struct dm_deferred_set *all_io_ds; mempool_t *migration_pool; - struct dm_cache_migration *next_migration; struct dm_cache_policy *policy; unsigned policy_nr_args; @@ -265,10 +270,31 @@ static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cel dm_bio_prison_free_cell(cache->prison, cell); } +static struct dm_cache_migration *alloc_migration(struct cache *cache) +{ + struct dm_cache_migration *mg; + + mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); + if (mg) { + mg->cache = cache; + atomic_inc(&mg->cache->nr_allocated_migrations); + } + + return mg; +} + +static void free_migration(struct dm_cache_migration *mg) +{ + if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations)) + wake_up(&mg->cache->migration_wait); + + mempool_free(mg, mg->cache->migration_pool); +} + static int prealloc_data_structs(struct cache *cache, struct prealloc *p) { if (!p->mg) { - p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); + p->mg = alloc_migration(cache); if (!p->mg) return -ENOMEM; } @@ -297,7 +323,7 @@ static void prealloc_free_structs(struct cache *cache, struct prealloc *p) free_prison_cell(cache, p->cell1); if (p->mg) - mempool_free(p->mg, cache->migration_pool); + free_migration(p->mg); } static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) @@ -408,7 +434,7 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b) static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) { if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { - cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1); + atomic_inc(&cache->nr_dirty); policy_set_dirty(cache->policy, oblock); } } @@ -417,8 +443,7 @@ static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cbl { if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { policy_clear_dirty(cache->policy, oblock); - cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1); - if (!from_cblock(cache->nr_dirty)) + if (atomic_dec_return(&cache->nr_dirty) == 0) dm_table_event(cache->ti->table); } } @@ -709,24 +734,14 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, * Migration covers moving data from the origin device to the cache, or * vice versa. *--------------------------------------------------------------*/ -static void free_migration(struct dm_cache_migration *mg) +static void inc_io_migrations(struct cache *cache) { - mempool_free(mg, mg->cache->migration_pool); + atomic_inc(&cache->nr_io_migrations); } -static void inc_nr_migrations(struct cache *cache) +static void dec_io_migrations(struct cache *cache) { - atomic_inc(&cache->nr_migrations); -} - -static void dec_nr_migrations(struct cache *cache) -{ - atomic_dec(&cache->nr_migrations); - - /* - * Wake the worker in case we're suspending the target. - */ - wake_up(&cache->migration_wait); + atomic_dec(&cache->nr_io_migrations); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, @@ -749,11 +764,10 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, wake_worker(cache); } -static void cleanup_migration(struct dm_cache_migration *mg) +static void free_io_migration(struct dm_cache_migration *mg) { - struct cache *cache = mg->cache; + dec_io_migrations(mg->cache); free_migration(mg); - dec_nr_migrations(cache); } static void migration_failure(struct dm_cache_migration *mg) @@ -778,7 +792,7 @@ static void migration_failure(struct dm_cache_migration *mg) cell_defer(cache, mg->new_ocell, 1); } - cleanup_migration(mg); + free_io_migration(mg); } static void migration_success_pre_commit(struct dm_cache_migration *mg) @@ -787,9 +801,9 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg) struct cache *cache = mg->cache; if (mg->writeback) { - cell_defer(cache, mg->old_ocell, false); clear_dirty(cache, mg->old_oblock, mg->cblock); - cleanup_migration(mg); + cell_defer(cache, mg->old_ocell, false); + free_io_migration(mg); return; } else if (mg->demote) { @@ -799,14 +813,14 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg) mg->old_oblock); if (mg->promote) cell_defer(cache, mg->new_ocell, true); - cleanup_migration(mg); + free_io_migration(mg); return; } } else { if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); policy_remove_mapping(cache->policy, mg->new_oblock); - cleanup_migration(mg); + free_io_migration(mg); return; } } @@ -837,12 +851,12 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) spin_unlock_irqrestore(&cache->lock, flags); } else - cleanup_migration(mg); + free_io_migration(mg); } else { - cell_defer(cache, mg->new_ocell, true); clear_dirty(cache, mg->new_oblock, mg->cblock); - cleanup_migration(mg); + cell_defer(cache, mg->new_ocell, true); + free_io_migration(mg); } } @@ -1003,7 +1017,7 @@ static void promote(struct cache *cache, struct prealloc *structs, mg->new_ocell = cell; mg->start_jiffies = jiffies; - inc_nr_migrations(cache); + inc_io_migrations(cache); quiesce_migration(mg); } @@ -1024,7 +1038,7 @@ static void writeback(struct cache *cache, struct prealloc *structs, mg->new_ocell = NULL; mg->start_jiffies = jiffies; - inc_nr_migrations(cache); + inc_io_migrations(cache); quiesce_migration(mg); } @@ -1048,7 +1062,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, mg->new_ocell = new_ocell; mg->start_jiffies = jiffies; - inc_nr_migrations(cache); + inc_io_migrations(cache); quiesce_migration(mg); } @@ -1109,7 +1123,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio) static bool spare_migration_bandwidth(struct cache *cache) { - sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) * + sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; return current_volume < cache->migration_threshold; } @@ -1400,7 +1414,7 @@ static void stop_quiescing(struct cache *cache) static void wait_for_migrations(struct cache *cache) { - wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); + wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); } static void stop_worker(struct cache *cache) @@ -1509,9 +1523,6 @@ static void destroy(struct cache *cache) { unsigned i; - if (cache->next_migration) - mempool_free(cache->next_migration, cache->migration_pool); - if (cache->migration_pool) mempool_destroy(cache->migration_pool); @@ -1908,35 +1919,6 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return 0; } -/* - * We want the discard block size to be a power of two, at least the size - * of the cache block size, and have no more than 2^14 discard blocks - * across the origin. - */ -#define MAX_DISCARD_BLOCKS (1 << 14) - -static bool too_many_discard_blocks(sector_t discard_block_size, - sector_t origin_size) -{ - (void) sector_div(origin_size, discard_block_size); - - return origin_size > MAX_DISCARD_BLOCKS; -} - -static sector_t calculate_discard_block_size(sector_t cache_block_size, - sector_t origin_size) -{ - sector_t discard_block_size; - - discard_block_size = roundup_pow_of_two(cache_block_size); - - if (origin_size) - while (too_many_discard_blocks(discard_block_size, origin_size)) - discard_block_size *= 2; - - return discard_block_size; -} - #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -1961,6 +1943,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; ti->discard_zeroes_data_unsupported = true; + /* Discard bios must be split on a block boundary */ + ti->split_discard_bios = true; cache->features = ca->features; ti->per_bio_data_size = get_per_bio_data_size(cache); @@ -2026,14 +2010,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); - atomic_set(&cache->nr_migrations, 0); + atomic_set(&cache->nr_allocated_migrations, 0); + atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); init_waitqueue_head(&cache->quiescing_wait); atomic_set(&cache->quiescing_ack, 0); r = -ENOMEM; - cache->nr_dirty = 0; + atomic_set(&cache->nr_dirty, 0); cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); if (!cache->dirty_bitset) { *error = "could not allocate dirty bitset"; @@ -2041,9 +2026,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); - cache->discard_block_size = - calculate_discard_block_size(cache->sectors_per_block, - cache->origin_sectors); + cache->discard_block_size = cache->sectors_per_block; cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); if (!cache->discard_bitset) { @@ -2087,8 +2070,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->next_migration = NULL; - cache->need_tick_bio = true; cache->sized = false; cache->quiescing = false; @@ -2531,7 +2512,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); - DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", + DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %lu ", (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, (unsigned) atomic_read(&cache->stats.read_hit), @@ -2541,7 +2522,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, (unsigned) atomic_read(&cache->stats.demotion), (unsigned) atomic_read(&cache->stats.promotion), (unsigned long long) from_cblock(residency), - cache->nr_dirty); + (unsigned long) atomic_read(&cache->nr_dirty)); if (cache->features.write_through) DMEMIT("1 writethrough "); @@ -2630,7 +2611,7 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) /* * FIXME: these limits may be incompatible with the cache device */ - limits->max_discard_sectors = cache->discard_block_size * 1024; + limits->max_discard_sectors = cache->discard_block_size; limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0fce0bc..0f64dc5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,7 +18,6 @@ #include <linux/crypto.h> #include <linux/workqueue.h> #include <linux/backing-dev.h> -#include <linux/percpu.h> #include <linux/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> @@ -44,6 +43,7 @@ struct convert_context { unsigned int idx_out; sector_t cc_sector; atomic_t cc_pending; + struct ablkcipher_request *req; }; /* @@ -105,15 +105,7 @@ struct iv_lmk_private { enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; /* - * Duplicated per-CPU state for cipher. - */ -struct crypt_cpu { - struct ablkcipher_request *req; -}; - -/* - * The fields in here must be read only after initialization, - * changing state should be in crypt_cpu. + * The fields in here must be read only after initialization. */ struct crypt_config { struct dm_dev *dev; @@ -143,12 +135,6 @@ struct crypt_config { sector_t iv_offset; unsigned int iv_size; - /* - * Duplicated per cpu state. Access through - * per_cpu_ptr() only. - */ - struct crypt_cpu __percpu *cpu; - /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; struct crypto_ablkcipher **tfms; @@ -184,11 +170,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); -static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) -{ - return this_cpu_ptr(cc->cpu); -} - /* * Use this to access cipher attributes that are the same for each CPU. */ @@ -738,16 +719,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - struct crypt_cpu *this_cc = this_crypt_config(cc); unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); - if (!this_cc->req) - this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + if (!ctx->req) + ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]); - ablkcipher_request_set_callback(this_cc->req, + ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + ablkcipher_request_set_callback(ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); + kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } /* @@ -756,7 +736,6 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { - struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->cc_pending, 1); @@ -768,7 +747,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->cc_pending); - r = crypt_convert_block(cc, ctx, this_cc->req); + r = crypt_convert_block(cc, ctx, ctx->req); switch (r) { /* async */ @@ -777,7 +756,7 @@ static int crypt_convert(struct crypt_config *cc, INIT_COMPLETION(ctx->restart); /* fall through*/ case -EINPROGRESS: - this_cc->req = NULL; + ctx->req = NULL; ctx->cc_sector++; continue; @@ -876,6 +855,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, io->sector = sector; io->error = 0; io->base_io = NULL; + io->ctx.req = NULL; atomic_set(&io->io_pending, 0); return io; @@ -901,6 +881,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (!atomic_dec_and_test(&io->io_pending)) return; + if (io->ctx.req) + mempool_free(io->ctx.req, cc->req_pool); mempool_free(io, cc->io_pool); if (likely(!base_io)) @@ -1326,8 +1308,6 @@ static int crypt_wipe_key(struct crypt_config *cc) static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; - struct crypt_cpu *cpu_cc; - int cpu; ti->private = NULL; @@ -1339,13 +1319,6 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); - if (cc->cpu) - for_each_possible_cpu(cpu) { - cpu_cc = per_cpu_ptr(cc->cpu, cpu); - if (cpu_cc->req) - mempool_free(cpu_cc->req, cc->req_pool); - } - crypt_free_tfms(cc); if (cc->bs) @@ -1364,9 +1337,6 @@ static void crypt_dtr(struct dm_target *ti) if (cc->dev) dm_put_device(ti, cc->dev); - if (cc->cpu) - free_percpu(cc->cpu); - kzfree(cc->cipher); kzfree(cc->cipher_string); @@ -1421,13 +1391,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)), - __alignof__(struct crypt_cpu)); - if (!cc->cpu) { - ti->error = "Cannot allocate per cpu state"; - goto bad_mem; - } - /* * For compatibility with the original dm-crypt mapping format, if * only the cipher name is supplied, use cbc-plain. @@ -1543,6 +1506,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned int key_size, opt_params; unsigned long long tmpll; int ret; + size_t iv_size_padding; struct dm_arg_set as; const char *opt_string; char dummy; @@ -1579,12 +1543,23 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->dmreq_start = sizeof(struct ablkcipher_request); cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); - cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & - ~(crypto_tfm_ctx_alignment() - 1); + cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); + + if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { + /* Allocate the padding exactly */ + iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) + & crypto_ablkcipher_alignmask(any_tfm(cc)); + } else { + /* + * If the cipher requires greater alignment than kmalloc + * alignment, we don't know the exact position of the + * initialization vector. We must assume worst case. + */ + iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc)); + } cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + - sizeof(struct dm_crypt_request) + cc->iv_size); + sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; goto bad; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2a20986..e60c2ea 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -10,6 +10,7 @@ #include <linux/device-mapper.h> #include <linux/bio.h> +#include <linux/completion.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/sched.h> @@ -32,7 +33,7 @@ struct dm_io_client { struct io { unsigned long error_bits; atomic_t count; - struct task_struct *sleeper; + struct completion *wait; struct dm_io_client *client; io_notify_fn callback; void *context; @@ -121,8 +122,8 @@ static void dec_count(struct io *io, unsigned int region, int error) invalidate_kernel_vmap_range(io->vma_invalidate_address, io->vma_invalidate_size); - if (io->sleeper) - wake_up_process(io->sleeper); + if (io->wait) + complete(io->wait); else { unsigned long r = io->error_bits; @@ -385,6 +386,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, */ volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); + DECLARE_COMPLETION_ONSTACK(wait); if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); @@ -393,7 +395,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->sleeper = current; + io->wait = &wait; io->client = client; io->vma_invalidate_address = dp->vma_invalidate_address; @@ -401,15 +403,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, dispatch_io(rw, num_regions, where, dp, io, 1); - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!atomic_read(&io->count)) - break; - - io_schedule(); - } - set_current_state(TASK_RUNNING); + wait_for_completion_io(&wait); if (error_bits) *error_bits = io->error_bits; @@ -432,7 +426,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->sleeper = NULL; + io->wait = NULL; io->client = client; io->callback = fn; io->context = context; diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 08d9a20..c69d0b7 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -272,7 +272,7 @@ int dm_ulog_tfr_init(void) r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); if (r) { - cn_del_callback(&ulog_cn_id); + kfree(prealloced_cn_msg); return r; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 4880b69..5971538 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -785,8 +785,7 @@ struct dm_raid_superblock { __le32 layout; __le32 stripe_sectors; - __u8 pad[452]; /* Round struct to 512 bytes. */ - /* Always set to 0 when writing. */ + /* Remainder of a logical block is zero-filled when writing (see super_sync()). */ } __packed; static int read_disk_sb(struct md_rdev *rdev, int size) @@ -823,7 +822,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) test_bit(Faulty, &(rs->dev[i].rdev.flags))) failed_devices |= (1ULL << i); - memset(sb, 0, sizeof(*sb)); + memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); sb->magic = cpu_to_le32(DM_RAID_MAGIC); sb->features = cpu_to_le32(0); /* No features yet */ @@ -858,7 +857,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) uint64_t events_sb, events_refsb; rdev->sb_start = 0; - rdev->sb_size = sizeof(*sb); + rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); + if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) { + DMERR("superblock size of a logical block is no longer valid"); + return -EINVAL; + } ret = read_disk_sb(rdev, rdev->sb_size); if (ret) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 3bb4506..b63095c 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -192,6 +192,13 @@ struct dm_pool_metadata { * operation possible in this state is the closing of the device. */ bool fail_io:1; + + /* + * Reading the space map roots can fail, so we read it into these + * buffers before the superblock is locked and updated. + */ + __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; }; struct dm_thin_device { @@ -431,26 +438,53 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd) pmd->details_info.value_type.equal = NULL; } +static int save_sm_roots(struct dm_pool_metadata *pmd) +{ + int r; + size_t len; + + r = dm_sm_root_size(pmd->metadata_sm, &len); + if (r < 0) + return r; + + r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len); + if (r < 0) + return r; + + r = dm_sm_root_size(pmd->data_sm, &len); + if (r < 0) + return r; + + return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len); +} + +static void copy_sm_roots(struct dm_pool_metadata *pmd, + struct thin_disk_superblock *disk) +{ + memcpy(&disk->metadata_space_map_root, + &pmd->metadata_space_map_root, + sizeof(pmd->metadata_space_map_root)); + + memcpy(&disk->data_space_map_root, + &pmd->data_space_map_root, + sizeof(pmd->data_space_map_root)); +} + static int __write_initial_superblock(struct dm_pool_metadata *pmd) { int r; struct dm_block *sblock; - size_t metadata_len, data_len; struct thin_disk_superblock *disk_super; sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT; if (bdev_size > THIN_METADATA_MAX_SECTORS) bdev_size = THIN_METADATA_MAX_SECTORS; - r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); - if (r < 0) - return r; - - r = dm_sm_root_size(pmd->data_sm, &data_len); + r = dm_sm_commit(pmd->data_sm); if (r < 0) return r; - r = dm_sm_commit(pmd->data_sm); + r = save_sm_roots(pmd); if (r < 0) return r; @@ -471,15 +505,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) disk_super->trans_id = 0; disk_super->held_root = 0; - r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, - metadata_len); - if (r < 0) - goto bad_locked; - - r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, - data_len); - if (r < 0) - goto bad_locked; + copy_sm_roots(pmd, disk_super); disk_super->data_mapping_root = cpu_to_le64(pmd->root); disk_super->device_details_root = cpu_to_le64(pmd->details_root); @@ -488,10 +514,6 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); return dm_tm_commit(pmd->tm, sblock); - -bad_locked: - dm_bm_unlock(sblock); - return r; } static int __format_metadata(struct dm_pool_metadata *pmd) @@ -591,6 +613,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd) disk_super = dm_block_data(sblock); + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk_super->data_block_size), + (unsigned long long)pmd->data_block_size); + r = -EINVAL; + goto bad_unlock_sblock; + } + r = __check_incompat_features(disk_super, pmd); if (r < 0) goto bad_unlock_sblock; @@ -769,6 +800,10 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) if (r < 0) return r; + r = save_sm_roots(pmd); + if (r < 0) + return r; + r = superblock_lock(pmd, &sblock); if (r) return r; @@ -780,21 +815,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) disk_super->trans_id = cpu_to_le64(pmd->trans_id); disk_super->flags = cpu_to_le32(pmd->flags); - r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, - metadata_len); - if (r < 0) - goto out_locked; - - r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, - data_len); - if (r < 0) - goto out_locked; + copy_sm_roots(pmd, disk_super); return dm_tm_commit(pmd->tm, sblock); - -out_locked: - dm_bm_unlock(sblock); - return r; } struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e9587101..0396d7f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1332,9 +1332,9 @@ static void process_deferred_bios(struct pool *pool) */ if (ensure_next_mapping(pool)) { spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->deferred_bios, bio); bio_list_merge(&pool->deferred_bios, &bios); spin_unlock_irqrestore(&pool->lock, flags); - break; } @@ -1504,6 +1504,14 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + /* + * We must hold the virtual cell before doing the lookup, otherwise + * there's a race with discard. + */ + build_virtual_key(tc->td, block, &key); + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) + return DM_MAPIO_SUBMITTED; + r = dm_thin_find_block(td, block, 0, &result); /* @@ -1527,13 +1535,10 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * shared flag will be set in their case. */ thin_defer_bio(tc, bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; } - build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) - return DM_MAPIO_SUBMITTED; - build_data_key(tc->td, result.block, &key); if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { cell_defer_no_holder_no_free(tc, &cell1); @@ -1554,6 +1559,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * of doing so. Just error it. */ bio_io_error(bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; } /* fall through */ @@ -1564,6 +1570,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * provide the hint to load the metadata into cache. */ thin_defer_bio(tc, bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; default: @@ -1573,6 +1580,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * pool is switched to fail-io mode. */ bio_io_error(bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; } } @@ -2695,7 +2703,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) */ if (pt->adjusted_pf.discard_passdown) { data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; - limits->discard_granularity = data_limits->discard_granularity; + limits->discard_granularity = max(data_limits->discard_granularity, + pool->sectors_per_block << SECTOR_SHIFT); } else limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c147f06..a562d5a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1813,14 +1813,14 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - BUG_ON_NONRT(!irqs_disabled()); + BUG_ON(!irqs_disabled()); spin_lock(q->queue_lock); } goto out; requeued: - BUG_ON_NONRT(!irqs_disabled()); + BUG_ON(!irqs_disabled()); spin_lock(q->queue_lock); delay_and_out: diff --git a/drivers/md/md.c b/drivers/md/md.c index 015bc45..bf030d4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7371,8 +7371,10 @@ void md_do_sync(struct md_thread *thread) /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (mddev->ro) /* never try to sync a read-only array */ + if (mddev->ro) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; + } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { @@ -7482,6 +7484,19 @@ void md_do_sync(struct md_thread *thread) rdev->recovery_offset < j) j = rdev->recovery_offset; rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -7825,6 +7840,7 @@ void md_check_recovery(struct mddev *mddev) /* There is no thread, but we need to call * ->spare_active and clear saved_raid_disk */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; @@ -8520,7 +8536,8 @@ static int md_notify_reboot(struct notifier_block *this, if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); - mddev->safemode = 2; + if (mddev->persistent) + mddev->safemode = 2; mddev_unlock(mddev); } need_delay = 1; diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 064a3c2..30597f3 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -595,25 +595,14 @@ int dm_bm_unlock(struct dm_block *b) } EXPORT_SYMBOL_GPL(dm_bm_unlock); -int dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock) +int dm_bm_flush(struct dm_block_manager *bm) { - int r; - if (bm->read_only) return -EPERM; - r = dm_bufio_write_dirty_buffers(bm->bufio); - if (unlikely(r)) { - dm_bm_unlock(superblock); - return r; - } - - dm_bm_unlock(superblock); - return dm_bufio_write_dirty_buffers(bm->bufio); } -EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock); +EXPORT_SYMBOL_GPL(dm_bm_flush); void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) { diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 13cd58e..1b95dfc 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -105,8 +105,7 @@ int dm_bm_unlock(struct dm_block *b); * * This method always blocks. */ -int dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock); +int dm_bm_flush(struct dm_block_manager *bm); /* * Request data is prefetched into the cache. diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 37d367b..bf2b80d 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -42,6 +42,12 @@ struct btree_node { } __packed; +/* + * Locks a block using the btree node validator. + */ +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, + struct dm_block **result); + void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index cf9fd67..1b5e13e 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -92,7 +92,7 @@ struct dm_block_validator btree_node_validator = { /*----------------------------------------------------------------*/ -static int bn_read_lock(struct dm_btree_info *info, dm_block_t b, +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, struct dm_block **result) { return dm_tm_read_lock(info->tm, b, &btree_node_validator, result); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 468e371..9701d29 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -828,22 +828,26 @@ EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); * FIXME: We shouldn't use a recursive algorithm when we have limited stack * space. Also this only works for single level trees. */ -static int walk_node(struct ro_spine *s, dm_block_t block, +static int walk_node(struct dm_btree_info *info, dm_block_t block, int (*fn)(void *context, uint64_t *keys, void *leaf), void *context) { int r; unsigned i, nr; + struct dm_block *node; struct btree_node *n; uint64_t keys; - r = ro_step(s, block); - n = ro_node(s); + r = bn_read_lock(info, block, &node); + if (r) + return r; + + n = dm_block_data(node); nr = le32_to_cpu(n->header.nr_entries); for (i = 0; i < nr; i++) { if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { - r = walk_node(s, value64(n, i), fn, context); + r = walk_node(info, value64(n, i), fn, context); if (r) goto out; } else { @@ -855,7 +859,7 @@ static int walk_node(struct ro_spine *s, dm_block_t block, } out: - ro_pop(s); + dm_tm_unlock(info->tm, node); return r; } @@ -863,15 +867,7 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, int (*fn)(void *context, uint64_t *keys, void *leaf), void *context) { - int r; - struct ro_spine spine; - BUG_ON(info->levels > 1); - - init_ro_spine(&spine, info); - r = walk_node(&spine, root, fn, context); - exit_ro_spine(&spine); - - return r; + return walk_node(info, root, fn, context); } EXPORT_SYMBOL_GPL(dm_btree_walk); diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 579b582..d9a5aa5 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -564,7 +564,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return smm->ll.nr_blocks; + *count = smm->ll.nr_blocks; + + return 0; } static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 81da1a2..3bc30a0 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -154,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm) if (r < 0) return r; - return 0; + return dm_bm_flush(tm->bm); } EXPORT_SYMBOL_GPL(dm_tm_pre_commit); @@ -164,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) return -EWOULDBLOCK; wipe_shadow_table(tm); + dm_bm_unlock(root); - return dm_bm_flush_and_unlock(tm->bm, root); + return dm_bm_flush(tm->bm); } EXPORT_SYMBOL_GPL(dm_tm_commit); diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index b5b1390..2772ed2 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac /* * We use a 2-phase commit here. * - * i) In the first phase the block manager is told to start flushing, and - * the changes to the space map are written to disk. You should interrogate - * your particular space map to get detail of its root node etc. to be - * included in your superblock. + * i) Make all changes for the transaction *except* for the superblock. + * Then call dm_tm_pre_commit() to flush them to disk. * - * ii) @root will be committed last. You shouldn't use more than the - * first 512 bytes of @root if you wish the transaction to survive a power - * failure. You *must* have a write lock held on @root for both stage (i) - * and (ii). The commit will drop the write lock. + * ii) Lock your superblock. Update. Then call dm_tm_commit() which will + * unlock the superblock and flush it. No other blocks should be updated + * during this period. Care should be taken to never unlock a partially + * updated superblock; perform any operations that could fail *before* you + * take the superblock lock. */ int dm_tm_pre_commit(struct dm_transaction_manager *tm); -int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root); +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock); /* * These methods are the only way to get hold of a writeable block. diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 6edc2db..6564eeb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -94,6 +94,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) struct pool_info *pi = data; struct r1bio *r1_bio; struct bio *bio; + int need_pages; int i, j; r1_bio = r1bio_pool_alloc(gfp_flags, pi); @@ -116,15 +117,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) * RESYNC_PAGES for each bio. */ if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) - j = pi->raid_disks; + need_pages = pi->raid_disks; else - j = 1; - while(j--) { + need_pages = 1; + for (j = 0; j < need_pages; j++) { bio = r1_bio->bios[j]; bio->bi_vcnt = RESYNC_PAGES; if (bio_alloc_pages(bio, gfp_flags)) - goto out_free_bio; + goto out_free_pages; } /* If not user-requests, copy the page pointers to all bios */ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { @@ -138,6 +139,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; +out_free_pages: + while (--j >= 0) { + struct bio_vec *bv; + + bio_for_each_segment_all(bv, r1_bio->bios[j], i) + __free_page(bv->bv_page); + } + out_free_bio: while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); @@ -1397,12 +1406,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) mddev->degraded++; set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * if recovery is running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); } else set_bit(Faulty, &rdev->flags); + /* + * if recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" @@ -2043,7 +2052,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) + !test_bit(Faulty, &rdev->flags)) r1_sync_page_io(rdev, sect, s, conf->tmppage, WRITE); } @@ -2055,7 +2064,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) { + !test_bit(Faulty, &rdev->flags)) { if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 308575d..9ccb107 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1698,13 +1698,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irqrestore(&conf->device_lock, flags); return; } - if (test_and_clear_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) mddev->degraded++; - /* - * if recovery is running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - } + /* + * If recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -2970,6 +2969,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, */ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); + close_sync(conf); return 0; } @@ -4420,7 +4420,7 @@ read_more: read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_sync_read; read_bio->bi_rw = READ; - read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); + read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); read_bio->bi_flags |= 1 << BIO_UPTODATE; read_bio->bi_vcnt = 0; read_bio->bi_size = 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab00c1e..7b54c3b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -64,6 +64,10 @@ #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE +static bool devices_handle_discard_safely = false; +module_param(devices_handle_discard_safely, bool, 0644); +MODULE_PARM_DESC(devices_handle_discard_safely, + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); static struct workqueue_struct *raid5_wq; /* * Stripe cache @@ -1541,9 +1545,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu_light(); + cpu = get_cpu(); percpu = per_cpu_ptr(conf->percpu, cpu); - spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @@ -1595,8 +1598,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - spin_unlock(&percpu->lock); - put_cpu_light(); + put_cpu(); } static int grow_one_stripe(struct r5conf *conf) @@ -2787,7 +2789,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (s->failed >= 2 && fdev[1]->toread) || (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { + ((sh->raid_conf->level == 6 || sh->sector >= sh->raid_conf->mddev->recovery_cp) + && s->failed && s->to_write))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -3674,6 +3677,8 @@ static void handle_stripe(struct stripe_head *sh) set_bit(R5_Wantwrite, &dev->flags); if (prexor) continue; + if (s.failed > 1) + continue; if (!test_bit(R5_Insync, &dev->flags) || ((i == sh->pd_idx || i == sh->qd_idx) && s.failed == 0)) @@ -5458,7 +5463,6 @@ static int raid5_alloc_percpu(struct r5conf *conf) __func__, cpu); break; } - spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); } put_online_cpus(); @@ -5939,7 +5943,7 @@ static int run(struct mddev *mddev) mddev->queue->limits.discard_granularity = stripe; /* * unaligned part of discard request will be ignored, so can't - * guarantee discard_zerors_data + * guarantee discard_zeroes_data */ mddev->queue->limits.discard_zeroes_data = 0; @@ -5964,6 +5968,18 @@ static int run(struct mddev *mddev) !bdev_get_queue(rdev->bdev)-> limits.discard_zeroes_data) discard_supported = false; + /* Unfortunately, discard_zeroes_data is not currently + * a guarantee - just a hint. So we only allow DISCARD + * if the sysadmin has confirmed that only safe devices + * are in use by setting a module parameter. + */ + if (!devices_handle_discard_safely) { + if (discard_supported) { + pr_info("md/raid456: discard support disabled due to uncertainty.\n"); + pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); + } + discard_supported = false; + } } if (discard_supported && diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 0a5e1e1..2113ffa 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -444,7 +444,6 @@ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { - spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address |