From 718822c1c112dc99e0c72c8968ee1db9d9d910f0 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 15 Nov 2013 16:12:20 -0500
Subject: dm delay: fix a possible deadlock due to shared workqueue

The dm-delay target uses a shared workqueue for multiple instances.  This
can cause deadlock if two or more dm-delay targets are stacked on the top
of each other.

This patch changes dm-delay to use a per-instance workqueue.

Cc: stable@vger.kernel.org # 2.6.22+

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 496d5f3..2f91d6d 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -20,6 +20,7 @@
 struct delay_c {
 	struct timer_list delay_timer;
 	struct mutex timer_lock;
+	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
 	atomic_t may_delay;
@@ -45,14 +46,13 @@ struct dm_delay_info {
 
 static DEFINE_MUTEX(delayed_bios_lock);
 
-static struct workqueue_struct *kdelayd_wq;
 static struct kmem_cache *delayed_cache;
 
 static void handle_delayed_timer(unsigned long data)
 {
 	struct delay_c *dc = (struct delay_c *)data;
 
-	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
 }
 
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -191,6 +191,12 @@ out:
 		goto bad_dev_write;
 	}
 
+	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+	if (!dc->kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
+	}
+
 	setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
 
 	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
@@ -203,6 +209,8 @@ out:
 	ti->private = dc;
 	return 0;
 
+bad_queue:
+	mempool_destroy(dc->delayed_pool);
 bad_dev_write:
 	if (dc->dev_write)
 		dm_put_device(ti, dc->dev_write);
@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	flush_workqueue(kdelayd_wq);
+	destroy_workqueue(dc->kdelayd_wq);
 
 	dm_put_device(ti, dc->dev_read);
 
@@ -350,12 +358,6 @@ static int __init dm_delay_init(void)
 {
 	int r = -ENOMEM;
 
-	kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-	if (!kdelayd_wq) {
-		DMERR("Couldn't start kdelayd");
-		goto bad_queue;
-	}
-
 	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
 	if (!delayed_cache) {
 		DMERR("Couldn't create delayed bio cache.");
@@ -373,8 +375,6 @@ static int __init dm_delay_init(void)
 bad_register:
 	kmem_cache_destroy(delayed_cache);
 bad_memcache:
-	destroy_workqueue(kdelayd_wq);
-bad_queue:
 	return r;
 }
 
@@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void)
 {
 	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
-	destroy_workqueue(kdelayd_wq);
 }
 
 /* Module hooks */
-- 
cgit v0.10.2


From 230c83afdd9cd384348475bea1e14b80b3b6b1b8 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 29 Nov 2013 18:13:37 -0500
Subject: dm snapshot: avoid snapshot space leak on crash

There is a possible leak of snapshot space in case of crash.

The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.

For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250

Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260

Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.

This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.

Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
  to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index aec57d7..944690b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -66,6 +66,18 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
+	/* Protected by "lock" */
+	sector_t exception_start_sequence;
+
+	/* Protected by kcopyd single-threaded callback */
+	sector_t exception_complete_sequence;
+
+	/*
+	 * A list of pending exceptions that completed out of order.
+	 * Protected by kcopyd single-threaded callback.
+	 */
+	struct list_head out_of_order_list;
+
 	mempool_t *pending_pool;
 
 	struct dm_exception_table pending;
@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
 	 */
 	int started;
 
+	/* There was copying error. */
+	int copy_error;
+
+	/* A sequence number, it is used for in-order completion. */
+	sector_t exception_sequence;
+
+	struct list_head out_of_order_entry;
+
 	/*
 	 * For writing a complete chunk, bypassing the copy.
 	 */
@@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	s->exception_start_sequence = 0;
+	s->exception_complete_sequence = 0;
+	INIT_LIST_HEAD(&s->out_of_order_list);
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
@@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success)
 	pending_complete(pe, success);
 }
 
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+
+	if (unlikely(pe->copy_error))
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s->store->type->commit_exception(s->store, &pe->e,
+						 commit_callback, pe);
+}
+
 /*
  * Called when the copy I/O has finished.  kcopyd actually runs
  * this code so don't block.
@@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 	struct dm_snap_pending_exception *pe = context;
 	struct dm_snapshot *s = pe->snap;
 
-	if (read_err || write_err)
-		pending_complete(pe, 0);
+	pe->copy_error = read_err || write_err;
 
-	else
-		/* Update the metadata if we are persistent */
-		s->store->type->commit_exception(s->store, &pe->e,
-						 commit_callback, pe);
+	if (pe->exception_sequence == s->exception_complete_sequence) {
+		s->exception_complete_sequence++;
+		complete_exception(pe);
+
+		while (!list_empty(&s->out_of_order_list)) {
+			pe = list_entry(s->out_of_order_list.next,
+					struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe->exception_sequence != s->exception_complete_sequence)
+				break;
+			s->exception_complete_sequence++;
+			list_del(&pe->out_of_order_entry);
+			complete_exception(pe);
+		}
+	} else {
+		struct list_head *lh;
+		struct dm_snap_pending_exception *pe2;
+
+		list_for_each_prev(lh, &s->out_of_order_list) {
+			pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe2->exception_sequence < pe->exception_sequence)
+				break;
+		}
+		list_add(&pe->out_of_order_entry, lh);
+	}
 }
 
 /*
@@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s,
 		return NULL;
 	}
 
+	pe->exception_sequence = s->exception_start_sequence++;
+
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
@@ -2192,7 +2249,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 11, 1},
+	.version = {1, 12, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
-- 
cgit v0.10.2


From 5b2d06576c5410c10d95adfd5c4d8b24de861d87 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 22 Nov 2013 19:52:06 -0500
Subject: dm table: fail dm_table_create on dm_round_up overflow

The dm_round_up function may overflow to zero.  In this case,
dm_table_create() must fail rather than go on to allocate an empty array
with alloc_targets().

This fixes a possible memory corruption that could be caused by passing
too large a number in "param->target_count".

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 465f08c..3ba6a38 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
 
+	if (!num_targets) {
+		kfree(t);
+		return -ENOMEM;
+	}
+
 	if (alloc_targets(t, num_targets)) {
 		kfree(t);
 		return -ENOMEM;
-- 
cgit v0.10.2


From f62b6b8f498658a9d537c7d380e9966f15e1b2a1 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 2 Dec 2013 16:47:01 -0500
Subject: dm space map metadata: return on failure in sm_metadata_new_block

Commit 2fc48021f4afdd109b9e52b6eef5db89ca80bac7 ("dm persistent
metadata: add space map threshold callback") introduced a regression
to the metadata block allocation path that resulted in errors being
ignored.  This regression was uncovered by running the following
device-mapper-test-suite test:
dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/

The ignored error codes in sm_metadata_new_block() could crash the
kernel through use of either the dm-thin or dm-cache targets, e.g.:

device-mapper: thin: 253:4: reached low water mark for metadata device: sending event.
device-mapper: space map metadata: unable to allocate new metadata block
general protection fault: 0000 [#1] SMP
...
Workqueue: dm-thin do_worker [dm_thin_pool]
task: ffff880035ce2ab0 ti: ffff88021a054000 task.ti: ffff88021a054000
RIP: 0010:[<ffffffffa0331385>]  [<ffffffffa0331385>] metadata_ll_load_ie+0x15/0x30 [dm_persistent_data]
RSP: 0018:ffff88021a055a68  EFLAGS: 00010202
RAX: 003fc8243d212ba0 RBX: ffff88021a780070 RCX: ffff88021a055a78
RDX: ffff88021a055a78 RSI: 0040402222a92a80 RDI: ffff88021a780070
RBP: ffff88021a055a68 R08: ffff88021a055ba4 R09: 0000000000000010
R10: 0000000000000000 R11: 00000002a02e1000 R12: ffff88021a055ad4
R13: 0000000000000598 R14: ffffffffa0338470 R15: ffff88021a055ba4
FS:  0000000000000000(0000) GS:ffff88033fca0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00007f467c0291b8 CR3: 0000000001a0b000 CR4: 00000000000007e0
Stack:
 ffff88021a055ab8 ffffffffa0332020 ffff88021a055b30 0000000000000001
 ffff88021a055b30 0000000000000000 ffff88021a055b18 0000000000000000
 ffff88021a055ba4 ffff88021a055b98 ffff88021a055ae8 ffffffffa033304c
Call Trace:
 [<ffffffffa0332020>] sm_ll_lookup_bitmap+0x40/0xa0 [dm_persistent_data]
 [<ffffffffa033304c>] sm_metadata_count_is_more_than_one+0x8c/0xc0 [dm_persistent_data]
 [<ffffffffa0333825>] dm_tm_shadow_block+0x65/0x110 [dm_persistent_data]
 [<ffffffffa0331b00>] sm_ll_mutate+0x80/0x300 [dm_persistent_data]
 [<ffffffffa0330e60>] ? set_ref_count+0x10/0x10 [dm_persistent_data]
 [<ffffffffa0331dba>] sm_ll_inc+0x1a/0x20 [dm_persistent_data]
 [<ffffffffa0332270>] sm_disk_new_block+0x60/0x80 [dm_persistent_data]
 [<ffffffff81520036>] ? down_write+0x16/0x40
 [<ffffffffa001e5c4>] dm_pool_alloc_data_block+0x54/0x80 [dm_thin_pool]
 [<ffffffffa001b23c>] alloc_data_block+0x9c/0x130 [dm_thin_pool]
 [<ffffffffa001c27e>] provision_block+0x4e/0x180 [dm_thin_pool]
 [<ffffffffa001fe9a>] ? dm_thin_find_block+0x6a/0x110 [dm_thin_pool]
 [<ffffffffa001c57a>] process_bio+0x1ca/0x1f0 [dm_thin_pool]
 [<ffffffff8111e2ed>] ? mempool_free+0x8d/0xa0
 [<ffffffffa001d755>] process_deferred_bios+0xc5/0x230 [dm_thin_pool]
 [<ffffffffa001d911>] do_worker+0x51/0x60 [dm_thin_pool]
 [<ffffffff81067872>] process_one_work+0x182/0x3b0
 [<ffffffff81068c90>] worker_thread+0x120/0x3a0
 [<ffffffff81068b70>] ? manage_workers+0x160/0x160
 [<ffffffff8106eb2e>] kthread+0xce/0xe0
 [<ffffffff8106ea60>] ? kthread_freezable_should_stop+0x70/0x70
 [<ffffffff8152af6c>] ret_from_fork+0x7c/0xb0
 [<ffffffff8106ea60>] ? kthread_freezable_should_stop+0x70/0x70
 [<ffffffff8152af6c>] ret_from_fork+0x7c/0xb0
 [<ffffffff8106ea60>] ? kthread_freezable_should_stop+0x70/0x70

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Cc: stable@vger.kernel.org # v3.10+

diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 1c95968..58fc1ee 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	int r = sm_metadata_new_block_(sm, b);
-	if (r)
+	if (r) {
 		DMERR("unable to allocate new metadata block");
+		return r;
+	}
 
 	r = sm_metadata_get_nr_free(sm, &count);
-	if (r)
+	if (r) {
 		DMERR("couldn't get free block count");
+		return r;
+	}
 
 	check_threshold(&smm->threshold, count);
 
-- 
cgit v0.10.2


From fafc7a815e40255d24e80a1cb7365892362fa398 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 2 Dec 2013 17:57:42 -0500
Subject: dm thin: switch to read only mode if a mapping insert fails

Switch the thin pool to read-only mode when dm_thin_insert_block() fails
since there is little reason to expect the cause of the failure to be
resolved without further action by user space.

This issue was noticed with the device-mapper-test-suite using:
dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/

The quantity of errors logged in this case must be reduced.

before patch:

device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: dm_thin_insert_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map metadata: unable to allocate new metadata block
<snip ... these repeat for a long while ... >
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map common: dm_tm_shadow_block() failed
device-mapper: thin: 253:4: no free metadata space available.
device-mapper: thin: 253:4: switching pool to read-only mode

after patch:

device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: 253:4: dm_thin_insert_block() failed: error = -28
device-mapper: thin: 253:4: switching pool to read-only mode

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2c0cf51..24327ad 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 */
 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 	if (r) {
-		DMERR_LIMIT("dm_thin_insert_block() failed");
+		DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
+		set_pool_mode(pool, PM_READ_ONLY);
 		cell_error(pool, m->cell);
 		goto out;
 	}
-- 
cgit v0.10.2


From 4a02b34e0cf1d0d0dd3737702841da4bf615a50a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 3 Dec 2013 12:20:57 -0500
Subject: dm thin: switch to read-only mode if metadata space is exhausted

Switch the thin pool to read-only mode in alloc_data_block() if
dm_pool_alloc_data_block() fails because the pool's metadata space is
exhausted.

Differentiate between data and metadata space in messages about no
free space available.

This issue was noticed with the device-mapper-test-suite using:
dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/

The quantity of errors logged in this case must be reduced.

before patch:

device-mapper: thin: 253:4: reached low water mark for metadata device: sending event.
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map common: dm_tm_shadow_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map common: dm_tm_shadow_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map common: dm_tm_shadow_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map common: dm_tm_shadow_block() failed
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: space map common: dm_tm_shadow_block() failed
<snip ... these repeat for a _very_ long while ... >
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: 253:4: commit failed: error = -28
device-mapper: thin: 253:4: switching pool to read-only mode

after patch:

device-mapper: thin: 253:4: reached low water mark for metadata device: sending event.
device-mapper: space map metadata: unable to allocate new metadata block
device-mapper: thin: 253:4: no free metadata space available.
device-mapper: thin: 253:4: switching pool to read-only mode

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 24327ad..44ee489 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -959,7 +959,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * table reload).
 		 */
 		if (!free_blocks) {
-			DMWARN("%s: no free space available.",
+			DMWARN("%s: no free data space available.",
 			       dm_device_name(pool->pool_md));
 			spin_lock_irqsave(&pool->lock, flags);
 			pool->no_free_space = 1;
@@ -969,8 +969,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 	}
 
 	r = dm_pool_alloc_data_block(pool->pmd, result);
-	if (r)
+	if (r) {
+		if (r == -ENOSPC &&
+		    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
+		    !free_blocks) {
+			DMWARN("%s: no free metadata space available.",
+			       dm_device_name(pool->pool_md));
+			set_pool_mode(pool, PM_READ_ONLY);
+		}
 		return r;
+	}
 
 	return 0;
 }
-- 
cgit v0.10.2


From 020cc3b5e28c2e24f59f53a9154faf08564f308e Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 4 Dec 2013 15:05:36 -0500
Subject: dm thin: always fallback the pool mode if commit fails

Rename commit_or_fallback() to commit().  Now all previous calls to
commit() will trigger the pool mode to fallback if the commit fails.

Also, check the error returned from commit() in alloc_data_block().

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 44ee489..af79bae 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -883,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	}
 }
 
-static int commit(struct pool *pool)
-{
-	int r;
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r)
-		DMERR_LIMIT("%s: commit failed: error = %d",
-			    dm_device_name(pool->pool_md), r);
-
-	return r;
-}
-
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
  */
-static int commit_or_fallback(struct pool *pool)
+static int commit(struct pool *pool)
 {
 	int r;
 
 	if (get_pool_mode(pool) != PM_WRITE)
 		return -EINVAL;
 
-	r = commit(pool);
-	if (r)
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
 		set_pool_mode(pool, PM_READ_ONLY);
+	}
 
 	return r;
 }
@@ -945,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * Try to commit to see if that will free up some
 		 * more space.
 		 */
-		(void) commit_or_fallback(pool);
+		r = commit(pool);
+		if (r)
+			return r;
 
 		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 		if (r)
@@ -1359,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool)
 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
 		return;
 
-	if (commit_or_fallback(pool)) {
+	if (commit(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -2276,7 +2269,7 @@ static int pool_preresume(struct dm_target *ti)
 		return r;
 
 	if (need_commit1 || need_commit2)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return 0;
 }
@@ -2303,7 +2296,7 @@ static void pool_postsuspend(struct dm_target *ti)
 
 	cancel_delayed_work(&pool->waker);
 	flush_workqueue(pool->wq);
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2437,7 +2430,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
 	if (r)
 		return r;
 
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2499,7 +2492,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
 	if (!r)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return r;
 }
@@ -2554,7 +2547,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 
 		/* Commit to ensure statistics aren't out-of-date */
 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-			(void) commit_or_fallback(pool);
+			(void) commit(pool);
 
 		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
 		if (r) {
-- 
cgit v0.10.2


From 5383ef3a929a1366e2ced45cd6d74be7aa2a2281 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 4 Dec 2013 16:30:01 -0500
Subject: dm thin: re-establish read-only state when switching to fail mode

If the thin-pool transitioned to fail mode and the thin-pool's table
were reloaded for some reason: the new table's default pool mode would
be read-write, though it will transition to fail mode during resume.

When the pool mode transitions directly from PM_WRITE to PM_FAIL we need
to re-establish the intermediate read-only state in both the metadata
and persistent-data block manager (as is usually done with the normal
pool mode transition sequence: PM_WRITE -> PM_READ_ONLY -> PM_FAIL).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index af79bae..0d7852e 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1400,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 	case PM_FAIL:
 		DMERR("%s: switching pool to failure mode",
 		      dm_device_name(pool->pool_md));
+		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
-- 
cgit v0.10.2


From 9b7aaa64f96f7ca280d75326fca42f42017b89ef Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 4 Dec 2013 16:58:19 -0500
Subject: dm thin: allow pool in read-only mode to transition to read-write
 mode

A thin-pool may be in read-only mode because the pool's data or metadata
space was exhausted.  To allow for recovery, by adding more space to the
pool, we must allow a pool to transition from PM_READ_ONLY to PM_WRITE
mode.  Otherwise, running out of space will render the pool permanently
read-only.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 60bce43..8a30ad5 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 	up_write(&pmd->root_lock);
 }
 
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+	down_write(&pmd->root_lock);
+	pmd->read_only = false;
+	dm_bm_set_read_write(pmd->bm);
+	up_write(&pmd->root_lock);
+}
+
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
 					dm_sm_threshold_fn fn,
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 845ebbe..7bcc0e1 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz
  * that nothing is changing.
  */
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
 
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 0d7852e..ee29037 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1425,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 		break;
 
 	case PM_WRITE:
+		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
@@ -1641,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	struct pool_c *pt = ti->private;
 
 	/*
-	 * We want to make sure that degraded pools are never upgraded.
+	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
 	enum pool_mode old_mode = pool->pf.mode;
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
-	if (old_mode > new_mode)
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.  So we never let the
+	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
+	 * may have been due to a lack of metadata or data space, and may
+	 * now work (ie. if the underlying devices have been resized).
+	 */
+	if (old_mode == PM_FAIL)
 		new_mode = old_mode;
 
 	pool->ti = ti;
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a7e8bf2..064a3c2 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
 }
 EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+	bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
+
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
 	return crc32c(~(u32) 0, data, len) ^ init_xor;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 9a82083..13cd58e 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b);
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock);
 
- /*
-  * Request data be prefetched into the cache.
-  */
+/*
+ * Request data is prefetched into the cache.
+ */
 void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 
 /*
@@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * be returned if you do.
  */
 void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
 
-- 
cgit v0.10.2


From af95e7a69b54bca48092e3013a92cfa3043c9c73 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 Nov 2013 10:51:20 +0000
Subject: dm cache policy mq: fix promotions to occur as expected

Micro benchmarks that repeatedly issued IO to a single block were
failing to cause a promotion from the origin device to the cache.  Fix
this by not updating the stats during map() if -EWOULDBLOCK will be
returned.

The mq policy will only update stats, consider migration, etc, once per
tick period (a unit of time established between dm-cache core and the
policies).

When the IO thread calls the policy's map method, if it would like to
migrate the associated block it returns -EWOULDBLOCK, the IO then gets
handed over to a worker thread which handles the migration.  The worker
thread calls map again, to check the migration is still needed (avoids a
race among other things).  *BUT*, before this fix, if we were still in
the same tick period the stats were already updated by the previous map
call -- so the migration would no longer be requested.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 416b7b7..64780ad 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 	int r = 0;
 	bool updated = updated_this_tick(mq, e);
 
-	requeue_and_update_tick(mq, e);
-
 	if ((!discarded_oblock && updated) ||
-	    !should_promote(mq, e, discarded_oblock, data_dir))
+	    !should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue_and_update_tick(mq, e);
 		result->op = POLICY_MISS;
-	else if (!can_migrate)
+
+	} else if (!can_migrate)
 		r = -EWOULDBLOCK;
-	else
+
+	else {
+		requeue_and_update_tick(mq, e);
 		r = pre_cache_to_cache(mq, e, result);
+	}
 
 	return r;
 }
-- 
cgit v0.10.2


From 83f539e1a45a07934f0da69dc545bcbde01de36c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 26 Nov 2013 11:03:54 -0500
Subject: dm cache: update Documentation for invalidate_cblocks's range syntax

The cache target's invalidate_cblocks message allows cache block
(cblock) ranges to be expressed with: <cblock start>-<cblock end>

The range's <cblock end> value is "one past the end", so the range
includes <cblock start> through <cblock end>-1.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>

diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index 274752f..719320b 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -266,10 +266,12 @@ E.g.
 Invalidation is removing an entry from the cache without writing it
 back.  Cache blocks can be invalidated via the invalidate_cblocks
 message, which takes an arbitrary number of cblock ranges.  Each cblock
-must be expressed as a decimal value, in the future a variant message
-that takes cblock ranges expressed in hexidecimal may be needed to
-better support efficient invalidation of larger caches.  The cache must
-be in passthrough mode when invalidate_cblocks is used.
+range's end value is "one past the end", meaning 5-10 expresses a range
+of values from 5 to 9.  Each cblock must be expressed as a decimal
+value, in the future a variant message that takes cblock ranges
+expressed in hexidecimal may be needed to better support efficient
+invalidation of larger caches.  The cache must be in passthrough mode
+when invalidate_cblocks is used.
 
    invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]*
 
-- 
cgit v0.10.2


From 088448007bb97af47ec3f05fc3e9517ffb5e9fba Mon Sep 17 00:00:00 2001
From: Vincent Pelletier <plr.vincent@gmail.com>
Date: Sat, 30 Nov 2013 12:58:42 +0100
Subject: dm cache: actually resize cache

Commit f494a9c6b1b6dd9a9f21bbb75d9210d478eeb498 ("dm cache: cache
shrinking support") broke cache resizing support.

dm_cache_resize() is called with cache->cache_size before it gets
updated to new_size, so it is a no-op.  But the dm-cache superblock is
updated with the new_size even though the backing dm-array is not
resized.  Fix this by passing the new_size to dm_cache_resize().

Signed-off-by: Vincent Pelletier <plr.vincent@gmail.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9efcf10..1b1469e 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 {
 	int r;
 
-	r = dm_cache_resize(cache->cmd, cache->cache_size);
+	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
 		DMERR("could not resize cache metadata");
 		return r;
-- 
cgit v0.10.2


From 4cb57ab4a2e61978f3a9b7d4f53988f30d61c27f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 5 Dec 2013 17:33:29 -0500
Subject: dm bufio: initialize read-only module parameters

Some module parameters in dm-bufio are read-only. These parameters
inform the user about memory consumption. They are not supposed to be
changed by the user.

However, despite being read-only, these parameters can be set on
modprobe or insmod command line, for example:
modprobe dm-bufio current_allocated_bytes=12345

The kernel doesn't expect that these variables can be non-zero at module
initialization and if the user sets them, it results in BUG.

This patch initializes the variables in the module init routine, so that
user-supplied values are ignored.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.2+

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 173cbb2..54bdd923 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void)
 {
 	__u64 mem;
 
+	dm_bufio_allocated_kmem_cache = 0;
+	dm_bufio_allocated_get_free_pages = 0;
+	dm_bufio_allocated_vmalloc = 0;
+	dm_bufio_current_allocated = 0;
+
 	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
 	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
 
-- 
cgit v0.10.2


From 76f5bee5c3b45c617f91243e85547fc8f67bc678 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 5 Dec 2013 17:34:19 -0500
Subject: dm stats: initialize read-only module parameter

The module parameter stats_current_allocated_bytes in dm-mod is
read-only.  This parameter informs the user about memory
consumption.  It is not supposed to be changed by the user.

However, despite being read-only, this parameter can be set on
modprobe or insmod command line:
modprobe dm-mod stats_current_allocated_bytes=12345

The kernel doesn't expect that this variable can be non-zero at module
initialization and if the user sets it, it results in warning.

This patch initializes the variable in the module init routine, so
that user-supplied value is ignored.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.12+

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 3d404c1..28a9012 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 int __init dm_statistics_init(void)
 {
+	shared_memory_amount = 0;
 	dm_stat_need_rcu_barrier = 0;
 	return 0;
 }
-- 
cgit v0.10.2


From 5b564d80f8bc21094c0cd2b19b679d983aabcc29 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 13 Dec 2013 12:31:08 +0000
Subject: dm space map: disallow decrementing a reference count below zero

The old behaviour, returning -EINVAL if a ref_count of 0 would be
decremented, was removed in commit f722063 ("dm space map: optimise
sm_ll_dec and sm_ll_inc").  To fix this regression we return an error
code from the mutator function pointer passed to sm_ll_mutate() and have
dec_ref_count() return -EINVAL if the old ref_count is 0.

Add a DMERR to reflect the potential seriousness of this error.

Also, add missing dm_tm_unlock() to sm_ll_mutate()'s error path.

With this fix the following dmts regression test now passes:
 dmtest run --suite cache -n /metadata_use_kernel/

The next patch fixes the higher-level dm-array code that exposed this
regression.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.12+

diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 6058569..466a60b 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 }
 
 static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
-			uint32_t (*mutator)(void *context, uint32_t old),
+			int (*mutator)(void *context, uint32_t old, uint32_t *new),
 			void *context, enum allocation_event *ev)
 {
 	int r;
@@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 
 	if (old > 2) {
 		r = sm_ll_lookup_big_ref_count(ll, b, &old);
-		if (r < 0)
+		if (r < 0) {
+			dm_tm_unlock(ll->tm, nb);
 			return r;
+		}
 	}
 
-	ref_count = mutator(context, old);
+	r = mutator(context, old, &ref_count);
+	if (r) {
+		dm_tm_unlock(ll->tm, nb);
+		return r;
+	}
 
 	if (ref_count <= 2) {
 		sm_set_bitmap(bm_le, bit, ref_count);
@@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 	return ll->save_ie(ll, index, &ie_disk);
 }
 
-static uint32_t set_ref_count(void *context, uint32_t old)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return *((uint32_t *) context);
+	*new = *((uint32_t *) context);
+	return 0;
 }
 
 int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
@@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
 
-static uint32_t inc_ref_count(void *context, uint32_t old)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old + 1;
+	*new = old + 1;
+	return 0;
 }
 
 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
@@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
 	return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
 }
 
-static uint32_t dec_ref_count(void *context, uint32_t old)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old - 1;
+	if (!old) {
+		DMERR_LIMIT("unable to decrement a reference count below 0");
+		return -EINVAL;
+	}
+
+	*new = old - 1;
+	return 0;
 }
 
 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
-- 
cgit v0.10.2


From ed9571f0cf1fe09d3506302610f3ccdfa1d22c4a Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 13 Dec 2013 14:55:55 +0000
Subject: dm array: fix a reference counting bug in shadow_ablock

An old array block could have its reference count decremented below
zero when it is being replaced in the btree by a new array block.

The fix is to increment the old ablock's reference count just before
inserting a new ablock into the btree.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.9+

diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index af96e24..1d75b1d 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
 	 * The shadow op will often be a noop.  Only insert if it really
 	 * copied data.
 	 */
-	if (dm_block_location(*block) != b)
+	if (dm_block_location(*block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
 		r = insert_ablock(info, index, *block, root);
+	}
 
 	return r;
 }
-- 
cgit v0.10.2