From 718822c1c112dc99e0c72c8968ee1db9d9d910f0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Nov 2013 16:12:20 -0500 Subject: dm delay: fix a possible deadlock due to shared workqueue The dm-delay target uses a shared workqueue for multiple instances. This can cause deadlock if two or more dm-delay targets are stacked on the top of each other. This patch changes dm-delay to use a per-instance workqueue. Cc: stable@vger.kernel.org # 2.6.22+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 496d5f3..2f91d6d 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -20,6 +20,7 @@ struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; + struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; atomic_t may_delay; @@ -45,14 +46,13 @@ struct dm_delay_info { static DEFINE_MUTEX(delayed_bios_lock); -static struct workqueue_struct *kdelayd_wq; static struct kmem_cache *delayed_cache; static void handle_delayed_timer(unsigned long data) { struct delay_c *dc = (struct delay_c *)data; - queue_work(kdelayd_wq, &dc->flush_expired_bios); + queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } static void queue_timeout(struct delay_c *dc, unsigned long expires) @@ -191,6 +191,12 @@ out: goto bad_dev_write; } + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + DMERR("Couldn't start kdelayd"); + goto bad_queue; + } + setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); @@ -203,6 +209,8 @@ out: ti->private = dc; return 0; +bad_queue: + mempool_destroy(dc->delayed_pool); bad_dev_write: if (dc->dev_write) dm_put_device(ti, dc->dev_write); @@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - flush_workqueue(kdelayd_wq); + destroy_workqueue(dc->kdelayd_wq); dm_put_device(ti, dc->dev_read); @@ -350,12 +358,6 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); - if (!kdelayd_wq) { - DMERR("Couldn't start kdelayd"); - goto bad_queue; - } - delayed_cache = KMEM_CACHE(dm_delay_info, 0); if (!delayed_cache) { DMERR("Couldn't create delayed bio cache."); @@ -373,8 +375,6 @@ static int __init dm_delay_init(void) bad_register: kmem_cache_destroy(delayed_cache); bad_memcache: - destroy_workqueue(kdelayd_wq); -bad_queue: return r; } @@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void) { dm_unregister_target(&delay_target); kmem_cache_destroy(delayed_cache); - destroy_workqueue(kdelayd_wq); } /* Module hooks */ -- cgit v0.10.2 From 230c83afdd9cd384348475bea1e14b80b3b6b1b8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 29 Nov 2013 18:13:37 -0500 Subject: dm snapshot: avoid snapshot space leak on crash There is a possible leak of snapshot space in case of crash. The reason for space leaking is that chunks in the snapshot device are allocated sequentially, but they are finished (and stored in the metadata) out of order, depending on the order in which copying finished. For example, supposed that the metadata contains the following records SUPERBLOCK METADATA (blocks 0 ... 250) DATA 0 DATA 1 DATA 2 ... DATA 250 Now suppose that you allocate 10 new data blocks 251-260. Suppose that copying of these blocks finish out of order (block 260 finished first and the block 251 finished last). Now, the snapshot device looks like this: SUPERBLOCK METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256) DATA 0 DATA 1 DATA 2 ... DATA 250 DATA 251 DATA 252 DATA 253 DATA 254 DATA 255 METADATA (blocks 255, 254, 253, 252, 251) DATA 256 DATA 257 DATA 258 DATA 259 DATA 260 Now, if the machine crashes after writing the first metadata block but before writing the second metadata block, the space for areas DATA 250-255 is leaked, it contains no valid data and it will never be used in the future. This patch makes dm-snapshot complete exceptions in the same order they were allocated, thus fixing this bug. Note: when backporting this patch to the stable kernel, change the version field in the following way: * if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0} * if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it to {1, 10, 2} Userspace reads the version to determine if the bug was fixed, so the version change is needed. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index aec57d7..944690b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -66,6 +66,18 @@ struct dm_snapshot { atomic_t pending_exceptions_count; + /* Protected by "lock" */ + sector_t exception_start_sequence; + + /* Protected by kcopyd single-threaded callback */ + sector_t exception_complete_sequence; + + /* + * A list of pending exceptions that completed out of order. + * Protected by kcopyd single-threaded callback. + */ + struct list_head out_of_order_list; + mempool_t *pending_pool; struct dm_exception_table pending; @@ -173,6 +185,14 @@ struct dm_snap_pending_exception { */ int started; + /* There was copying error. */ + int copy_error; + + /* A sequence number, it is used for in-order completion. */ + sector_t exception_sequence; + + struct list_head out_of_order_entry; + /* * For writing a complete chunk, bypassing the copy. */ @@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->valid = 1; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); + s->exception_start_sequence = 0; + s->exception_complete_sequence = 0; + INIT_LIST_HEAD(&s->out_of_order_list); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); @@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success) pending_complete(pe, success); } +static void complete_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + if (unlikely(pe->copy_error)) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); +} + /* * Called when the copy I/O has finished. kcopyd actually runs * this code so don't block. @@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) struct dm_snap_pending_exception *pe = context; struct dm_snapshot *s = pe->snap; - if (read_err || write_err) - pending_complete(pe, 0); + pe->copy_error = read_err || write_err; - else - /* Update the metadata if we are persistent */ - s->store->type->commit_exception(s->store, &pe->e, - commit_callback, pe); + if (pe->exception_sequence == s->exception_complete_sequence) { + s->exception_complete_sequence++; + complete_exception(pe); + + while (!list_empty(&s->out_of_order_list)) { + pe = list_entry(s->out_of_order_list.next, + struct dm_snap_pending_exception, out_of_order_entry); + if (pe->exception_sequence != s->exception_complete_sequence) + break; + s->exception_complete_sequence++; + list_del(&pe->out_of_order_entry); + complete_exception(pe); + } + } else { + struct list_head *lh; + struct dm_snap_pending_exception *pe2; + + list_for_each_prev(lh, &s->out_of_order_list) { + pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); + if (pe2->exception_sequence < pe->exception_sequence) + break; + } + list_add(&pe->out_of_order_entry, lh); + } } /* @@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s, return NULL; } + pe->exception_sequence = s->exception_start_sequence++; + dm_insert_exception(&s->pending, &pe->e); return pe; @@ -2192,7 +2249,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 11, 1}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, -- cgit v0.10.2 From 5b2d06576c5410c10d95adfd5c4d8b24de861d87 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 22 Nov 2013 19:52:06 -0500 Subject: dm table: fail dm_table_create on dm_round_up overflow The dm_round_up function may overflow to zero. In this case, dm_table_create() must fail rather than go on to allocate an empty array with alloc_targets(). This fixes a possible memory corruption that could be caused by passing too large a number in "param->target_count". Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 465f08c..3ba6a38 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + if (!num_targets) { + kfree(t); + return -ENOMEM; + } + if (alloc_targets(t, num_targets)) { kfree(t); return -ENOMEM; -- cgit v0.10.2 From f62b6b8f498658a9d537c7d380e9966f15e1b2a1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 2 Dec 2013 16:47:01 -0500 Subject: dm space map metadata: return on failure in sm_metadata_new_block Commit 2fc48021f4afdd109b9e52b6eef5db89ca80bac7 ("dm persistent metadata: add space map threshold callback") introduced a regression to the metadata block allocation path that resulted in errors being ignored. This regression was uncovered by running the following device-mapper-test-suite test: dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/ The ignored error codes in sm_metadata_new_block() could crash the kernel through use of either the dm-thin or dm-cache targets, e.g.: device-mapper: thin: 253:4: reached low water mark for metadata device: sending event. device-mapper: space map metadata: unable to allocate new metadata block general protection fault: 0000 [#1] SMP ... Workqueue: dm-thin do_worker [dm_thin_pool] task: ffff880035ce2ab0 ti: ffff88021a054000 task.ti: ffff88021a054000 RIP: 0010:[] [] metadata_ll_load_ie+0x15/0x30 [dm_persistent_data] RSP: 0018:ffff88021a055a68 EFLAGS: 00010202 RAX: 003fc8243d212ba0 RBX: ffff88021a780070 RCX: ffff88021a055a78 RDX: ffff88021a055a78 RSI: 0040402222a92a80 RDI: ffff88021a780070 RBP: ffff88021a055a68 R08: ffff88021a055ba4 R09: 0000000000000010 R10: 0000000000000000 R11: 00000002a02e1000 R12: ffff88021a055ad4 R13: 0000000000000598 R14: ffffffffa0338470 R15: ffff88021a055ba4 FS: 0000000000000000(0000) GS:ffff88033fca0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f467c0291b8 CR3: 0000000001a0b000 CR4: 00000000000007e0 Stack: ffff88021a055ab8 ffffffffa0332020 ffff88021a055b30 0000000000000001 ffff88021a055b30 0000000000000000 ffff88021a055b18 0000000000000000 ffff88021a055ba4 ffff88021a055b98 ffff88021a055ae8 ffffffffa033304c Call Trace: [] sm_ll_lookup_bitmap+0x40/0xa0 [dm_persistent_data] [] sm_metadata_count_is_more_than_one+0x8c/0xc0 [dm_persistent_data] [] dm_tm_shadow_block+0x65/0x110 [dm_persistent_data] [] sm_ll_mutate+0x80/0x300 [dm_persistent_data] [] ? set_ref_count+0x10/0x10 [dm_persistent_data] [] sm_ll_inc+0x1a/0x20 [dm_persistent_data] [] sm_disk_new_block+0x60/0x80 [dm_persistent_data] [] ? down_write+0x16/0x40 [] dm_pool_alloc_data_block+0x54/0x80 [dm_thin_pool] [] alloc_data_block+0x9c/0x130 [dm_thin_pool] [] provision_block+0x4e/0x180 [dm_thin_pool] [] ? dm_thin_find_block+0x6a/0x110 [dm_thin_pool] [] process_bio+0x1ca/0x1f0 [dm_thin_pool] [] ? mempool_free+0x8d/0xa0 [] process_deferred_bios+0xc5/0x230 [dm_thin_pool] [] do_worker+0x51/0x60 [dm_thin_pool] [] process_one_work+0x182/0x3b0 [] worker_thread+0x120/0x3a0 [] ? manage_workers+0x160/0x160 [] kthread+0xce/0xe0 [] ? kthread_freezable_should_stop+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? kthread_freezable_should_stop+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? kthread_freezable_should_stop+0x70/0x70 Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org # v3.10+ diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 1c95968..58fc1ee 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); int r = sm_metadata_new_block_(sm, b); - if (r) + if (r) { DMERR("unable to allocate new metadata block"); + return r; + } r = sm_metadata_get_nr_free(sm, &count); - if (r) + if (r) { DMERR("couldn't get free block count"); + return r; + } check_threshold(&smm->threshold, count); -- cgit v0.10.2 From fafc7a815e40255d24e80a1cb7365892362fa398 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 2 Dec 2013 17:57:42 -0500 Subject: dm thin: switch to read only mode if a mapping insert fails Switch the thin pool to read-only mode when dm_thin_insert_block() fails since there is little reason to expect the cause of the failure to be resolved without further action by user space. This issue was noticed with the device-mapper-test-suite using: dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/ The quantity of errors logged in this case must be reduced. before patch: device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: thin: 253:4: no free metadata space available. device-mapper: thin: 253:4: switching pool to read-only mode after patch: device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: 253:4: dm_thin_insert_block() failed: error = -28 device-mapper: thin: 253:4: switching pool to read-only mode Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2c0cf51..24327ad 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { - DMERR_LIMIT("dm_thin_insert_block() failed"); + DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", + dm_device_name(pool->pool_md), r); + set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, m->cell); goto out; } -- cgit v0.10.2 From 4a02b34e0cf1d0d0dd3737702841da4bf615a50a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Dec 2013 12:20:57 -0500 Subject: dm thin: switch to read-only mode if metadata space is exhausted Switch the thin pool to read-only mode in alloc_data_block() if dm_pool_alloc_data_block() fails because the pool's metadata space is exhausted. Differentiate between data and metadata space in messages about no free space available. This issue was noticed with the device-mapper-test-suite using: dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/ The quantity of errors logged in this case must be reduced. before patch: device-mapper: thin: 253:4: reached low water mark for metadata device: sending event. device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: 253:4: commit failed: error = -28 device-mapper: thin: 253:4: switching pool to read-only mode after patch: device-mapper: thin: 253:4: reached low water mark for metadata device: sending event. device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: 253:4: no free metadata space available. device-mapper: thin: 253:4: switching pool to read-only mode Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 24327ad..44ee489 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -959,7 +959,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) * table reload). */ if (!free_blocks) { - DMWARN("%s: no free space available.", + DMWARN("%s: no free data space available.", dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); pool->no_free_space = 1; @@ -969,8 +969,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) } r = dm_pool_alloc_data_block(pool->pmd, result); - if (r) + if (r) { + if (r == -ENOSPC && + !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && + !free_blocks) { + DMWARN("%s: no free metadata space available.", + dm_device_name(pool->pool_md)); + set_pool_mode(pool, PM_READ_ONLY); + } return r; + } return 0; } -- cgit v0.10.2 From 020cc3b5e28c2e24f59f53a9154faf08564f308e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 15:05:36 -0500 Subject: dm thin: always fallback the pool mode if commit fails Rename commit_or_fallback() to commit(). Now all previous calls to commit() will trigger the pool mode to fallback if the commit fails. Also, check the error returned from commit() in alloc_data_block(). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 44ee489..af79bae 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -883,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, } } -static int commit(struct pool *pool) -{ - int r; - - r = dm_pool_commit_metadata(pool->pmd); - if (r) - DMERR_LIMIT("%s: commit failed: error = %d", - dm_device_name(pool->pool_md), r); - - return r; -} - /* * A non-zero return indicates read_only or fail_io mode. * Many callers don't care about the return value. */ -static int commit_or_fallback(struct pool *pool) +static int commit(struct pool *pool) { int r; if (get_pool_mode(pool) != PM_WRITE) return -EINVAL; - r = commit(pool); - if (r) + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", + dm_device_name(pool->pool_md), r); set_pool_mode(pool, PM_READ_ONLY); + } return r; } @@ -945,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) * Try to commit to see if that will free up some * more space. */ - (void) commit_or_fallback(pool); + r = commit(pool); + if (r) + return r; r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); if (r) @@ -1359,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool) if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) return; - if (commit_or_fallback(pool)) { + if (commit(pool)) { while ((bio = bio_list_pop(&bios))) bio_io_error(bio); return; @@ -2276,7 +2269,7 @@ static int pool_preresume(struct dm_target *ti) return r; if (need_commit1 || need_commit2) - (void) commit_or_fallback(pool); + (void) commit(pool); return 0; } @@ -2303,7 +2296,7 @@ static void pool_postsuspend(struct dm_target *ti) cancel_delayed_work(&pool->waker); flush_workqueue(pool->wq); - (void) commit_or_fallback(pool); + (void) commit(pool); } static int check_arg_count(unsigned argc, unsigned args_required) @@ -2437,7 +2430,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct if (r) return r; - (void) commit_or_fallback(pool); + (void) commit(pool); r = dm_pool_reserve_metadata_snap(pool->pmd); if (r) @@ -2499,7 +2492,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) DMWARN("Unrecognised thin pool target message received: %s", argv[0]); if (!r) - (void) commit_or_fallback(pool); + (void) commit(pool); return r; } @@ -2554,7 +2547,7 @@ static void pool_status(struct dm_target *ti, status_type_t type, /* Commit to ensure statistics aren't out-of-date */ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) - (void) commit_or_fallback(pool); + (void) commit(pool); r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); if (r) { -- cgit v0.10.2 From 5383ef3a929a1366e2ced45cd6d74be7aa2a2281 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 16:30:01 -0500 Subject: dm thin: re-establish read-only state when switching to fail mode If the thin-pool transitioned to fail mode and the thin-pool's table were reloaded for some reason: the new table's default pool mode would be read-write, though it will transition to fail mode during resume. When the pool mode transitions directly from PM_WRITE to PM_FAIL we need to re-establish the intermediate read-only state in both the metadata and persistent-data block manager (as is usually done with the normal pool mode transition sequence: PM_WRITE -> PM_READ_ONLY -> PM_FAIL). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index af79bae..0d7852e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1400,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) case PM_FAIL: DMERR("%s: switching pool to failure mode", dm_device_name(pool->pool_md)); + dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; -- cgit v0.10.2 From 9b7aaa64f96f7ca280d75326fca42f42017b89ef Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 16:58:19 -0500 Subject: dm thin: allow pool in read-only mode to transition to read-write mode A thin-pool may be in read-only mode because the pool's data or metadata space was exhausted. To allow for recovery, by adding more space to the pool, we must allow a pool to transition from PM_READ_ONLY to PM_WRITE mode. Otherwise, running out of space will render the pool permanently read-only. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 60bce43..8a30ad5 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) up_write(&pmd->root_lock); } +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) +{ + down_write(&pmd->root_lock); + pmd->read_only = false; + dm_bm_set_read_write(pmd->bm); + up_write(&pmd->root_lock); +} + int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_block_t threshold, dm_sm_threshold_fn fn, diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 845ebbe..7bcc0e1 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz * that nothing is changing. */ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd); int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_block_t threshold, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 0d7852e..ee29037 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1425,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) break; case PM_WRITE: + dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; pool->process_discard = process_discard; pool->process_prepared_mapping = process_prepared_mapping; @@ -1641,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) struct pool_c *pt = ti->private; /* - * We want to make sure that degraded pools are never upgraded. + * We want to make sure that a pool in PM_FAIL mode is never upgraded. */ enum pool_mode old_mode = pool->pf.mode; enum pool_mode new_mode = pt->adjusted_pf.mode; - if (old_mode > new_mode) + /* + * If we were in PM_FAIL mode, rollback of metadata failed. We're + * not going to recover without a thin_repair. So we never let the + * pool move out of the old mode. On the other hand a PM_READ_ONLY + * may have been due to a lack of metadata or data space, and may + * now work (ie. if the underlying devices have been resized). + */ + if (old_mode == PM_FAIL) new_mode = old_mode; pool->ti = ti; diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a7e8bf2..064a3c2 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm) } EXPORT_SYMBOL_GPL(dm_bm_set_read_only); +void dm_bm_set_read_write(struct dm_block_manager *bm) +{ + bm->read_only = false; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_write); + u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) { return crc32c(~(u32) 0, data, len) ^ init_xor; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 9a82083..13cd58e 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b); int dm_bm_flush_and_unlock(struct dm_block_manager *bm, struct dm_block *superblock); - /* - * Request data be prefetched into the cache. - */ +/* + * Request data is prefetched into the cache. + */ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); /* @@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); * be returned if you do. */ void dm_bm_set_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_write(struct dm_block_manager *bm); u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); -- cgit v0.10.2 From af95e7a69b54bca48092e3013a92cfa3043c9c73 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 15 Nov 2013 10:51:20 +0000 Subject: dm cache policy mq: fix promotions to occur as expected Micro benchmarks that repeatedly issued IO to a single block were failing to cause a promotion from the origin device to the cache. Fix this by not updating the stats during map() if -EWOULDBLOCK will be returned. The mq policy will only update stats, consider migration, etc, once per tick period (a unit of time established between dm-cache core and the policies). When the IO thread calls the policy's map method, if it would like to migrate the associated block it returns -EWOULDBLOCK, the IO then gets handed over to a worker thread which handles the migration. The worker thread calls map again, to check the migration is still needed (avoids a race among other things). *BUT*, before this fix, if we were still in the same tick period the stats were already updated by the previous map call -- so the migration would no longer be requested. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 416b7b7..64780ad 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, int r = 0; bool updated = updated_this_tick(mq, e); - requeue_and_update_tick(mq, e); - if ((!discarded_oblock && updated) || - !should_promote(mq, e, discarded_oblock, data_dir)) + !should_promote(mq, e, discarded_oblock, data_dir)) { + requeue_and_update_tick(mq, e); result->op = POLICY_MISS; - else if (!can_migrate) + + } else if (!can_migrate) r = -EWOULDBLOCK; - else + + else { + requeue_and_update_tick(mq, e); r = pre_cache_to_cache(mq, e, result); + } return r; } -- cgit v0.10.2 From 83f539e1a45a07934f0da69dc545bcbde01de36c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 26 Nov 2013 11:03:54 -0500 Subject: dm cache: update Documentation for invalidate_cblocks's range syntax The cache target's invalidate_cblocks message allows cache block (cblock) ranges to be expressed with: - The range's value is "one past the end", so the range includes through -1. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 274752f..719320b 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -266,10 +266,12 @@ E.g. Invalidation is removing an entry from the cache without writing it back. Cache blocks can be invalidated via the invalidate_cblocks message, which takes an arbitrary number of cblock ranges. Each cblock -must be expressed as a decimal value, in the future a variant message -that takes cblock ranges expressed in hexidecimal may be needed to -better support efficient invalidation of larger caches. The cache must -be in passthrough mode when invalidate_cblocks is used. +range's end value is "one past the end", meaning 5-10 expresses a range +of values from 5 to 9. Each cblock must be expressed as a decimal +value, in the future a variant message that takes cblock ranges +expressed in hexidecimal may be needed to better support efficient +invalidation of larger caches. The cache must be in passthrough mode +when invalidate_cblocks is used. invalidate_cblocks [|-]* -- cgit v0.10.2 From 088448007bb97af47ec3f05fc3e9517ffb5e9fba Mon Sep 17 00:00:00 2001 From: Vincent Pelletier Date: Sat, 30 Nov 2013 12:58:42 +0100 Subject: dm cache: actually resize cache Commit f494a9c6b1b6dd9a9f21bbb75d9210d478eeb498 ("dm cache: cache shrinking support") broke cache resizing support. dm_cache_resize() is called with cache->cache_size before it gets updated to new_size, so it is a no-op. But the dm-cache superblock is updated with the new_size even though the backing dm-array is not resized. Fix this by passing the new_size to dm_cache_resize(). Signed-off-by: Vincent Pelletier Acked-by: Joe Thornber Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9efcf10..1b1469e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) { int r; - r = dm_cache_resize(cache->cmd, cache->cache_size); + r = dm_cache_resize(cache->cmd, new_size); if (r) { DMERR("could not resize cache metadata"); return r; -- cgit v0.10.2 From 4cb57ab4a2e61978f3a9b7d4f53988f30d61c27f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 5 Dec 2013 17:33:29 -0500 Subject: dm bufio: initialize read-only module parameters Some module parameters in dm-bufio are read-only. These parameters inform the user about memory consumption. They are not supposed to be changed by the user. However, despite being read-only, these parameters can be set on modprobe or insmod command line, for example: modprobe dm-bufio current_allocated_bytes=12345 The kernel doesn't expect that these variables can be non-zero at module initialization and if the user sets them, it results in BUG. This patch initializes the variables in the module init routine, so that user-supplied values are ignored. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.2+ diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 173cbb2..54bdd923 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void) { __u64 mem; + dm_bufio_allocated_kmem_cache = 0; + dm_bufio_allocated_get_free_pages = 0; + dm_bufio_allocated_vmalloc = 0; + dm_bufio_current_allocated = 0; + memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); -- cgit v0.10.2 From 76f5bee5c3b45c617f91243e85547fc8f67bc678 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 5 Dec 2013 17:34:19 -0500 Subject: dm stats: initialize read-only module parameter The module parameter stats_current_allocated_bytes in dm-mod is read-only. This parameter informs the user about memory consumption. It is not supposed to be changed by the user. However, despite being read-only, this parameter can be set on modprobe or insmod command line: modprobe dm-mod stats_current_allocated_bytes=12345 The kernel doesn't expect that this variable can be non-zero at module initialization and if the user sets it, it results in warning. This patch initializes the variable in the module init routine, so that user-supplied value is ignored. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.12+ diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 3d404c1..28a9012 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, int __init dm_statistics_init(void) { + shared_memory_amount = 0; dm_stat_need_rcu_barrier = 0; return 0; } -- cgit v0.10.2 From 5b564d80f8bc21094c0cd2b19b679d983aabcc29 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 13 Dec 2013 12:31:08 +0000 Subject: dm space map: disallow decrementing a reference count below zero The old behaviour, returning -EINVAL if a ref_count of 0 would be decremented, was removed in commit f722063 ("dm space map: optimise sm_ll_dec and sm_ll_inc"). To fix this regression we return an error code from the mutator function pointer passed to sm_ll_mutate() and have dec_ref_count() return -EINVAL if the old ref_count is 0. Add a DMERR to reflect the potential seriousness of this error. Also, add missing dm_tm_unlock() to sm_ll_mutate()'s error path. With this fix the following dmts regression test now passes: dmtest run --suite cache -n /metadata_use_kernel/ The next patch fixes the higher-level dm-array code that exposed this regression. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.12+ diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 6058569..466a60b 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, } static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, - uint32_t (*mutator)(void *context, uint32_t old), + int (*mutator)(void *context, uint32_t old, uint32_t *new), void *context, enum allocation_event *ev) { int r; @@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, if (old > 2) { r = sm_ll_lookup_big_ref_count(ll, b, &old); - if (r < 0) + if (r < 0) { + dm_tm_unlock(ll->tm, nb); return r; + } } - ref_count = mutator(context, old); + r = mutator(context, old, &ref_count); + if (r) { + dm_tm_unlock(ll->tm, nb); + return r; + } if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); @@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, return ll->save_ie(ll, index, &ie_disk); } -static uint32_t set_ref_count(void *context, uint32_t old) +static int set_ref_count(void *context, uint32_t old, uint32_t *new) { - return *((uint32_t *) context); + *new = *((uint32_t *) context); + return 0; } int sm_ll_insert(struct ll_disk *ll, dm_block_t b, @@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); } -static uint32_t inc_ref_count(void *context, uint32_t old) +static int inc_ref_count(void *context, uint32_t old, uint32_t *new) { - return old + 1; + *new = old + 1; + return 0; } int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) @@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); } -static uint32_t dec_ref_count(void *context, uint32_t old) +static int dec_ref_count(void *context, uint32_t old, uint32_t *new) { - return old - 1; + if (!old) { + DMERR_LIMIT("unable to decrement a reference count below 0"); + return -EINVAL; + } + + *new = old - 1; + return 0; } int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) -- cgit v0.10.2 From ed9571f0cf1fe09d3506302610f3ccdfa1d22c4a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 13 Dec 2013 14:55:55 +0000 Subject: dm array: fix a reference counting bug in shadow_ablock An old array block could have its reference count decremented below zero when it is being replaced in the btree by a new array block. The fix is to increment the old ablock's reference count just before inserting a new ablock into the btree. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.9+ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index af96e24..1d75b1d 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, * The shadow op will often be a noop. Only insert if it really * copied data. */ - if (dm_block_location(*block) != b) + if (dm_block_location(*block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); r = insert_ablock(info, index, *block, root); + } return r; } -- cgit v0.10.2