From a1b51e98676932d031f5eec1325b2df4bbdc8f26 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 6 Jan 2009 03:04:53 +0000
Subject: dm table: drop reference at unbind

Move one dm_table_put() so that the last reference in the thread
gets dropped in __unbind().

This is required for a following patch,
dm-table-rework-reference-counting.patch, which will change the logic in
such a way that table destructor is called only at specific points in
the code.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 421c9f0..8237141 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1330,8 +1330,8 @@ void dm_put(struct mapped_device *md)
 			dm_table_presuspend_targets(map);
 			dm_table_postsuspend_targets(map);
 		}
-		__unbind(md);
 		dm_table_put(map);
+		__unbind(md);
 		free_dev(md);
 	}
 }
-- 
cgit v0.10.2


From 90fa1527bddc7147dc0d590ee6184ca88bc50ecf Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 6 Jan 2009 03:04:54 +0000
Subject: dm snapshot: change yield to msleep

Change yield() to msleep(1). If the thread had realtime priority,
yield() doesn't really yield, so the yielding process would loop
indefinitely and cause machine lockup.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 6c96db2..4ceedd4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -9,6 +9,7 @@
 #include <linux/blkdev.h>
 #include <linux/ctype.h>
 #include <linux/device-mapper.h>
+#include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kdev_t.h>
@@ -735,7 +736,7 @@ static void snapshot_dtr(struct dm_target *ti)
 	unregister_snapshot(s);
 
 	while (atomic_read(&s->pending_exceptions_count))
-		yield();
+		msleep(1);
 	/*
 	 * Ensure instructions in mempool_destroy aren't reordered
 	 * before atomic_read.
@@ -888,10 +889,10 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 
 	/*
 	 * Check for conflicting reads. This is extremely improbable,
-	 * so yield() is sufficient and there is no need for a wait queue.
+	 * so msleep(1) is sufficient and there is no need for a wait queue.
 	 */
 	while (__chunk_is_tracked(s, pe->e.old_chunk))
-		yield();
+		msleep(1);
 
 	/*
 	 * Add a proper exception, and remove the
-- 
cgit v0.10.2


From c7a2bd19b7c1e0bd2c7604c53d2583e91e536948 Mon Sep 17 00:00:00 2001
From: Takahiro Yasui <tyasui@redhat.com>
Date: Tue, 6 Jan 2009 03:04:56 +0000
Subject: dm log: fix dm_io_client leak on error paths

In create_log_context function, dm_io_client_destroy function needs
to be called, when memory allocation of disk_header, sync_bits and
recovering_bits failed, but dm_io_client_destroy is not called.

Cc: stable@kernel.org
Signed-off-by: Takahiro Yasui <tyasui@redhat.com>
Acked-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a8c0fc7..13e2a1a 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -467,6 +467,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		lc->disk_header = vmalloc(buf_size);
 		if (!lc->disk_header) {
 			DMWARN("couldn't allocate disk log buffer");
+			dm_io_client_destroy(lc->io_req.client);
 			kfree(lc);
 			return -ENOMEM;
 		}
@@ -482,6 +483,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		DMWARN("couldn't allocate sync bitset");
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
@@ -495,6 +498,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		vfree(lc->sync_bits);
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
-- 
cgit v0.10.2


From d460c65a6a9ec9e0d284864ec3a9a2d1b73f0e43 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 6 Jan 2009 03:04:57 +0000
Subject: dm raid1: fix error count

Always increase the error count when I/O on a leg of a mirror fails.

The error count is used to decide whether to select an alternative
mirror leg.  If the target doesn't use the "handle_errors" feature, the
error count is not updated and the bio can get requeued forever by the
read callback.

Fix it by increasing error_count before the handle_errors feature
checking.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec43f9f..d0fed2b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -197,9 +197,6 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
-	if (!errors_handled(ms))
-		return;
-
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -210,6 +207,9 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	if (test_and_set_bit(error_type, &m->error_type))
 		return;
 
+	if (!errors_handled(ms))
+		return;
+
 	if (m != get_default_mirror(ms))
 		goto out;
 
-- 
cgit v0.10.2


From 10d3bd09a3c25df114f74f7f86e1b58d070bef32 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 6 Jan 2009 03:04:58 +0000
Subject: dm: consolidate target deregistration error handling

Change dm_unregister_target to return void and use BUG() for error
reporting.

dm_unregister_target can only fail because of programming bug in the
target driver. It can't fail because of user's behavior or disk errors.

This patch changes unregister_target to return void and use BUG if
someone tries to unregister non-registered target or unregister target
that is in use.

This patch removes code duplication (testing of error codes in all dm
targets) and reports bugs in just one place, in dm_unregister_target. In
some target drivers, these return codes were ignored, which could lead
to a situation where bugs could be missed.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3326750..35bda49 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1322,11 +1322,7 @@ static int __init dm_crypt_init(void)
 
 static void __exit dm_crypt_exit(void)
 {
-	int r = dm_unregister_target(&crypt_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
 }
 
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 848b381..59ee1b0 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -364,11 +364,7 @@ bad_queue:
 
 static void __exit dm_delay_exit(void)
 {
-	int r = dm_unregister_target(&delay_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
 	destroy_workqueue(kdelayd_wq);
 }
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 44042be..79fb53e 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -156,8 +156,5 @@ int __init dm_linear_init(void)
 
 void dm_linear_exit(void)
 {
-	int r = dm_unregister_target(&linear_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&linear_target);
 }
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3d7f492..345a260 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1495,14 +1495,10 @@ static int __init dm_multipath_init(void)
 
 static void __exit dm_multipath_exit(void)
 {
-	int r;
-
 	destroy_workqueue(kmpath_handlerd);
 	destroy_workqueue(kmultipathd);
 
-	r = dm_unregister_target(&multipath_target);
-	if (r < 0)
-		DMERR("target unregister failed %d", r);
+	dm_unregister_target(&multipath_target);
 	kmem_cache_destroy(_mpio_cache);
 }
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d0fed2b..250f401 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1300,11 +1300,7 @@ static int __init dm_mirror_init(void)
 
 static void __exit dm_mirror_exit(void)
 {
-	int r;
-
-	r = dm_unregister_target(&mirror_target);
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&mirror_target);
 }
 
 /* Module hooks */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 4ceedd4..a8005b4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1470,17 +1470,10 @@ static int __init dm_snapshot_init(void)
 
 static void __exit dm_snapshot_exit(void)
 {
-	int r;
-
 	destroy_workqueue(ksnapd);
 
-	r = dm_unregister_target(&snapshot_target);
-	if (r)
-		DMERR("snapshot unregister failed %d", r);
-
-	r = dm_unregister_target(&origin_target);
-	if (r)
-		DMERR("origin unregister failed %d", r);
+	dm_unregister_target(&snapshot_target);
+	dm_unregister_target(&origin_target);
 
 	exit_origin_hash();
 	kmem_cache_destroy(pending_cache);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 9e4ef88..41569bc 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -337,9 +337,7 @@ int __init dm_stripe_init(void)
 
 void dm_stripe_exit(void)
 {
-	if (dm_unregister_target(&stripe_target))
-		DMWARN("target unregistration failed");
-
+	dm_unregister_target(&stripe_target);
 	destroy_workqueue(kstriped);
 
 	return;
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 835cf95..7decf10 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -130,26 +130,26 @@ int dm_register_target(struct target_type *t)
 	return rv;
 }
 
-int dm_unregister_target(struct target_type *t)
+void dm_unregister_target(struct target_type *t)
 {
 	struct tt_internal *ti;
 
 	down_write(&_lock);
 	if (!(ti = __find_target_type(t->name))) {
-		up_write(&_lock);
-		return -EINVAL;
+		DMCRIT("Unregistering unrecognised target: %s", t->name);
+		BUG();
 	}
 
 	if (ti->use) {
-		up_write(&_lock);
-		return -ETXTBSY;
+		DMCRIT("Attempt to unregister target still in use: %s",
+		       t->name);
+		BUG();
 	}
 
 	list_del(&ti->list);
 	kfree(ti);
 
 	up_write(&_lock);
-	return 0;
 }
 
 /*
@@ -187,8 +187,7 @@ int __init dm_target_init(void)
 
 void dm_target_exit(void)
 {
-	if (dm_unregister_target(&error_target))
-		DMWARN("error target unregistration failed");
+	dm_unregister_target(&error_target);
 }
 
 EXPORT_SYMBOL(dm_register_target);
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index cdbf126..bbc9703 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -69,10 +69,7 @@ static int __init dm_zero_init(void)
 
 static void __exit dm_zero_exit(void)
 {
-	int r = dm_unregister_target(&zero_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&zero_target);
 }
 
 module_init(dm_zero_init)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index c17fd33..89ff2df 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -157,8 +157,7 @@ struct dm_target {
 };
 
 int dm_register_target(struct target_type *t);
-int dm_unregister_target(struct target_type *t);
-
+void dm_unregister_target(struct target_type *t);
 
 /*-----------------------------------------------------------------
  * Functions for creating and manipulating mapped devices.
@@ -276,6 +275,9 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
  *---------------------------------------------------------------*/
 #define DM_NAME "device-mapper"
 
+#define DMCRIT(f, arg...) \
+	printk(KERN_CRIT DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
+
 #define DMERR(f, arg...) \
 	printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
 #define DMERR_LIMIT(f, arg...) \
-- 
cgit v0.10.2


From 6f3af01cb0eda0ec50fe1e4cbdf028269dc396fe Mon Sep 17 00:00:00 2001
From: Takahiro Yasui <tyasui@redhat.com>
Date: Tue, 6 Jan 2009 03:04:59 +0000
Subject: dm log: avoid reinitialising io_req on every operation

rw_header function updates three members of io_req data every time
when I/O is processed. bi_rw and notify.fn are never modified once
they get initialized, and so they can be set in advance.

header_to_disk() can also be pulled out of write_header() since only one
caller needs it and write_header() can be replaced by rw_header()
directly.

Signed-off-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 13e2a1a..691cb9c 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -326,8 +326,6 @@ static void header_from_disk(struct log_header *core, struct log_header *disk)
 static int rw_header(struct log_c *lc, int rw)
 {
 	lc->io_req.bi_rw = rw;
-	lc->io_req.mem.ptr.vma = lc->disk_header;
-	lc->io_req.notify.fn = NULL;
 
 	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
 }
@@ -362,12 +360,6 @@ static int read_header(struct log_c *log)
 	return 0;
 }
 
-static inline int write_header(struct log_c *log)
-{
-	header_to_disk(&log->header, log->disk_header);
-	return rw_header(log, WRITE);
-}
-
 /*----------------------------------------------------------------
  * core log constructor/destructor
  *
@@ -454,7 +446,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
 				       bitset_size, ti->limits.hardsect_size);
 		lc->header_location.count = buf_size >> SECTOR_SHIFT;
+
 		lc->io_req.mem.type = DM_IO_VMA;
+		lc->io_req.notify.fn = NULL;
 		lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
 								   PAGE_SIZE));
 		if (IS_ERR(lc->io_req.client)) {
@@ -472,6 +466,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 			return -ENOMEM;
 		}
 
+		lc->io_req.mem.ptr.vma = lc->disk_header;
 		lc->clean_bits = (void *)lc->disk_header +
 				 (LOG_OFFSET << SECTOR_SHIFT);
 	}
@@ -636,8 +631,10 @@ static int disk_resume(struct dm_dirty_log *log)
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
 
+	header_to_disk(&lc->header, lc->disk_header);
+
 	/* write the new header */
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -687,7 +684,7 @@ static int disk_flush(struct dm_dirty_log *log)
 	if (!lc->touched)
 		return 0;
 
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
 	else
-- 
cgit v0.10.2


From 2045e88edb4e0c9ce62d317f77dc59d27d9c530e Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Tue, 6 Jan 2009 03:05:01 +0000
Subject: dm log: move region_size validation

Move log size validation from mirror target to log constructor.

Removed PAGE_SIZE restriction we no longer think necessary.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 691cb9c..40ed70d 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -360,6 +360,17 @@ static int read_header(struct log_c *log)
 	return 0;
 }
 
+static int _check_region_size(struct dm_target *ti, uint32_t region_size)
+{
+	if (region_size < 2 || region_size > ti->len)
+		return 0;
+
+	if (!is_power_of_2(region_size))
+		return 0;
+
+	return 1;
+}
+
 /*----------------------------------------------------------------
  * core log constructor/destructor
  *
@@ -395,8 +406,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		}
 	}
 
-	if (sscanf(argv[0], "%u", &region_size) != 1) {
-		DMWARN("invalid region size string");
+	if (sscanf(argv[0], "%u", &region_size) != 1 ||
+	    !_check_region_size(ti, region_size)) {
+		DMWARN("invalid region size %s", argv[0]);
 		return -EINVAL;
 	}
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 250f401..4d6bc10 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -808,12 +808,6 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 	kfree(ms);
 }
 
-static inline int _check_region_size(struct dm_target *ti, uint32_t size)
-{
-	return !(size % (PAGE_SIZE >> 9) || !is_power_of_2(size) ||
-		 size > ti->len);
-}
-
 static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 		      unsigned int mirror, char **argv)
 {
@@ -872,12 +866,6 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
-		ti->error = "Invalid region size";
-		dm_dirty_log_destroy(dl);
-		return NULL;
-	}
-
 	return dl;
 }
 
-- 
cgit v0.10.2


From ac1f0ac22c7be908fd33407273b9808bfaedada4 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Tue, 6 Jan 2009 03:05:02 +0000
Subject: dm log: ensure log bitmap fits on log device

Check that the log bitmap will fit within the log device.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 40ed70d..737961f 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -457,6 +457,14 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		 */
 		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
 				       bitset_size, ti->limits.hardsect_size);
+
+		if (buf_size > dev->bdev->bd_inode->i_size) {
+			DMWARN("log device %s too small: need %llu bytes",
+				dev->name, (unsigned long long)buf_size);
+			kfree(lc);
+			return -EINVAL;
+		}
+
 		lc->header_location.count = buf_size >> SECTOR_SHIFT;
 
 		lc->io_req.mem.type = DM_IO_VMA;
-- 
cgit v0.10.2


From 23d39f63aa87e812fd879b8bc32ee6ccfe733de3 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Tue, 6 Jan 2009 03:05:04 +0000
Subject: dm ioctl: allow dm_copy_name_and_uuid to return only one field

Allow NULL buffer in dm_copy_name_and_uuid if you only want to return one of
the fields.

(Required by a following patch that adds these fields to sysfs.)

Signed-off-by: Milan Broz <mbroz@redhat.com>
Reviewed-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 777c948..8da7a01 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1550,8 +1550,10 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 		goto out;
 	}
 
-	strcpy(name, hc->name);
-	strcpy(uuid, hc->uuid ? : "");
+	if (name)
+		strcpy(name, hc->name);
+	if (uuid)
+		strcpy(uuid, hc->uuid ? : "");
 
 out:
 	up_read(&_hash_lock);
-- 
cgit v0.10.2


From 8fbf26ad5b16ad3a826ca7fe3e86700420abed1f Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Tue, 6 Jan 2009 03:05:06 +0000
Subject: dm request: add caches

This patch prepares some kmem_caches for request-based dm.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8237141..4882ce7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,6 +32,7 @@ static unsigned int _major = 0;
 
 static DEFINE_SPINLOCK(_minor_lock);
 /*
+ * For bio-based dm.
  * One of these is allocated per bio.
  */
 struct dm_io {
@@ -43,6 +44,7 @@ struct dm_io {
 };
 
 /*
+ * For bio-based dm.
  * One of these is allocated per target within a bio.  Hopefully
  * this will be simplified out one day.
  */
@@ -54,6 +56,27 @@ struct dm_target_io {
 
 DEFINE_TRACE(block_bio_complete);
 
+/*
+ * For request-based dm.
+ * One of these is allocated per request.
+ */
+struct dm_rq_target_io {
+	struct mapped_device *md;
+	struct dm_target *ti;
+	struct request *orig, clone;
+	int error;
+	union map_info info;
+};
+
+/*
+ * For request-based dm.
+ * One of these is allocated per bio.
+ */
+struct dm_rq_clone_bio_info {
+	struct bio *orig;
+	struct request *rq;
+};
+
 union map_info *dm_get_mapinfo(struct bio *bio)
 {
 	if (bio && bio->bi_private)
@@ -149,6 +172,8 @@ struct mapped_device {
 #define MIN_IOS 256
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_tio_cache;
+static struct kmem_cache *_rq_tio_cache;
+static struct kmem_cache *_rq_bio_info_cache;
 
 static int __init local_init(void)
 {
@@ -164,9 +189,17 @@ static int __init local_init(void)
 	if (!_tio_cache)
 		goto out_free_io_cache;
 
+	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
+	if (!_rq_tio_cache)
+		goto out_free_tio_cache;
+
+	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
+	if (!_rq_bio_info_cache)
+		goto out_free_rq_tio_cache;
+
 	r = dm_uevent_init();
 	if (r)
-		goto out_free_tio_cache;
+		goto out_free_rq_bio_info_cache;
 
 	_major = major;
 	r = register_blkdev(_major, _name);
@@ -180,6 +213,10 @@ static int __init local_init(void)
 
 out_uevent_exit:
 	dm_uevent_exit();
+out_free_rq_bio_info_cache:
+	kmem_cache_destroy(_rq_bio_info_cache);
+out_free_rq_tio_cache:
+	kmem_cache_destroy(_rq_tio_cache);
 out_free_tio_cache:
 	kmem_cache_destroy(_tio_cache);
 out_free_io_cache:
@@ -190,6 +227,8 @@ out_free_io_cache:
 
 static void local_exit(void)
 {
+	kmem_cache_destroy(_rq_bio_info_cache);
+	kmem_cache_destroy(_rq_tio_cache);
 	kmem_cache_destroy(_tio_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
-- 
cgit v0.10.2


From 7d76345da6ed3927c9cbf5d3f7a7021e8bba7374 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Tue, 6 Jan 2009 03:05:07 +0000
Subject: dm request: extend target interface

This patch adds the following target interfaces for request-based dm.

  map_rq    : for mapping a request

  rq_end_io : for finishing a request

  busy      : for avoiding performance regression from bio-based dm.
              Target can tell dm core not to map requests now, and
              that may help requests in the block layer queue to be
              bigger by I/O merging.
              In bio-based dm, this behavior is done by device
              drivers managing the block layer queue.
              But in request-based dm, dm core has to do that
              since dm core manages the block layer queue.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 89ff2df..c1ba76c 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -45,6 +45,8 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
  */
 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
 			  union map_info *map_context);
+typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
+				  union map_info *map_context);
 
 /*
  * Returns:
@@ -57,6 +59,9 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
 typedef int (*dm_endio_fn) (struct dm_target *ti,
 			    struct bio *bio, int error,
 			    union map_info *map_context);
+typedef int (*dm_request_endio_fn) (struct dm_target *ti,
+				    struct request *clone, int error,
+				    union map_info *map_context);
 
 typedef void (*dm_flush_fn) (struct dm_target *ti);
 typedef void (*dm_presuspend_fn) (struct dm_target *ti);
@@ -75,6 +80,13 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
 			    struct bio_vec *biovec, int max_size);
 
+/*
+ * Returns:
+ *    0: The target can handle the next I/O immediately.
+ *    1: The target can't handle the next I/O immediately.
+ */
+typedef int (*dm_busy_fn) (struct dm_target *ti);
+
 void dm_error(const char *message);
 
 /*
@@ -107,7 +119,9 @@ struct target_type {
 	dm_ctr_fn ctr;
 	dm_dtr_fn dtr;
 	dm_map_fn map;
+	dm_map_request_fn map_rq;
 	dm_endio_fn end_io;
+	dm_request_endio_fn rq_end_io;
 	dm_flush_fn flush;
 	dm_presuspend_fn presuspend;
 	dm_postsuspend_fn postsuspend;
@@ -117,6 +131,7 @@ struct target_type {
 	dm_message_fn message;
 	dm_ioctl_fn ioctl;
 	dm_merge_fn merge;
+	dm_busy_fn busy;
 };
 
 struct io_restrictions {
-- 
cgit v0.10.2


From ab4c1424882be9cd70b89abf2b484add355712fa Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 6 Jan 2009 03:05:09 +0000
Subject: dm: support barriers on simple devices

Implement barrier support for single device DM devices

This patch implements barrier support in DM for the common case of dm linear
just remapping a single underlying device. In this case we can safely
pass the barrier through because there can be no reordering between
devices.

 NB. Any DM device might cease to support barriers if it gets
     reconfigured so code must continue to allow for a possible
     -EOPNOTSUPP on every barrier bio submitted.  - agk

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 79fb53e..bfa107f 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,6 +142,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
 	.merge  = linear_merge,
+	.features = DM_TARGET_SUPPORTS_BARRIERS,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 04e5fd7..ebaaf72 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -38,6 +38,8 @@ struct dm_table {
 	sector_t *highs;
 	struct dm_target *targets;
 
+	unsigned barriers_supported:1;
+
 	/*
 	 * Indicates the rw permissions for the new logical
 	 * device.  This should be a combination of FMODE_READ
@@ -227,6 +229,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	INIT_LIST_HEAD(&t->devices);
 	atomic_set(&t->holders, 1);
+	t->barriers_supported = 1;
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
@@ -728,6 +731,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 	/* FIXME: the plan is to combine high here and then have
 	 * the merge fn apply the target level restrictions. */
 	combine_restrictions_low(&t->limits, &tgt->limits);
+
+	if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
+		t->barriers_supported = 0;
+
 	return 0;
 
  bad:
@@ -772,6 +779,12 @@ int dm_table_complete(struct dm_table *t)
 
 	check_for_valid_limits(&t->limits);
 
+	/*
+	 * We only support barriers if there is exactly one underlying device.
+	 */
+	if (!list_is_singular(&t->devices))
+		t->barriers_supported = 0;
+
 	/* how many indexes will the btree have ? */
 	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
 	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -986,6 +999,12 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
 	return t->md;
 }
 
+int dm_table_barrier_ok(struct dm_table *t)
+{
+	return t->barriers_supported;
+}
+EXPORT_SYMBOL(dm_table_barrier_ok);
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4882ce7..dd953b1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -835,7 +835,11 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
 	ci.map = dm_get_table(md);
 	if (unlikely(!ci.map))
 		return -EIO;
-
+	if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
+		dm_table_put(ci.map);
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
 	ci.md = md;
 	ci.bio = bio;
 	ci.io = alloc_io(md);
@@ -919,15 +923,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	struct mapped_device *md = q->queuedata;
 	int cpu;
 
-	/*
-	 * There is no use in forwarding any barrier request since we can't
-	 * guarantee it is (or can be) handled by the targets correctly.
-	 */
-	if (unlikely(bio_barrier(bio))) {
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
-
 	down_read(&md->io_lock);
 
 	cpu = part_stat_lock();
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0ade60c..5b5d08b 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -51,6 +51,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
  * To check the return value from dm_table_find_target().
  */
 #define dm_target_is_valid(t) ((t)->table)
+int dm_table_barrier_ok(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index c1ba76c..8209e08 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -112,7 +112,14 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d);
 /*
  * Information about a target type
  */
+
+/*
+ * Target features
+ */
+#define DM_TARGET_SUPPORTS_BARRIERS 0x00000001
+
 struct target_type {
+	uint64_t features;
 	const char *name;
 	struct module *module;
 	unsigned version[3];
-- 
cgit v0.10.2


From d58168763f74d1edbc296d7038c60efe6493fdd4 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 6 Jan 2009 03:05:10 +0000
Subject: dm table: rework reference counting

Rework table reference counting.

The existing code uses a reference counter. When the last reference is
dropped and the counter reaches zero, the table destructor is called.
Table reference counters are acquired/released from upcalls from other
kernel code (dm_any_congested, dm_merge_bvec, dm_unplug_all).
If the reference counter reaches zero in one of the upcalls, the table
destructor is called from almost random kernel code.

This leads to various problems:
* dm_any_congested being called under a spinlock, which calls the
  destructor, which calls some sleeping function.
* the destructor attempting to take a lock that is already taken by the
  same process.
* stale reference from some other kernel code keeps the table
  constructed, which keeps some devices open, even after successful
  return from "dmsetup remove". This can confuse lvm and prevent closing
  of underlying devices or reusing device minor numbers.

The patch changes reference counting so that the table destructor can be
called only at predetermined places.

The table has always exactly one reference from either mapped_device->map
or hash_cell->new_map. After this patch, this reference is not counted
in table->holders.  A pair of dm_create_table/dm_destroy_table functions
is used for table creation/destruction.

Temporary references from the other code increase table->holders. A pair
of dm_table_get/dm_table_put functions is used to manipulate it.

When the table is about to be destroyed, we wait for table->holders to
reach 0. Then, we call the table destructor.  We use active waiting with
msleep(1), because the situation happens rarely (to one user in 5 years)
and removing the device isn't performance-critical task: the user doesn't
care if it takes one tick more or not.

This way, the destructor is called only at specific points
(dm_table_destroy function) and the above problems associated with lazy
destruction can't happen.

Finally remove the temporary protection added to dm_any_congested().

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 8da7a01..54d0588 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -233,7 +233,7 @@ static void __hash_remove(struct hash_cell *hc)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	dm_put(hc->md);
 	free_cell(hc);
 }
@@ -827,8 +827,8 @@ static int do_resume(struct dm_ioctl *param)
 
 		r = dm_swap_table(md, new_map);
 		if (r) {
+			dm_table_destroy(new_map);
 			dm_put(md);
-			dm_table_put(new_map);
 			return r;
 		}
 
@@ -836,8 +836,6 @@ static int do_resume(struct dm_ioctl *param)
 			set_disk_ro(dm_disk(md), 0);
 		else
 			set_disk_ro(dm_disk(md), 1);
-
-		dm_table_put(new_map);
 	}
 
 	if (dm_suspended(md))
@@ -1080,7 +1078,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	hc->new_map = t;
 	up_write(&_hash_lock);
 
@@ -1109,7 +1107,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map) {
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 		hc->new_map = NULL;
 	}
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ebaaf72..2fd66c3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2001 Sistina Software (UK) Limited.
- * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/mutex.h>
+#include <linux/delay.h>
 #include <asm/atomic.h>
 
 #define DM_MSG_PREFIX "table"
@@ -24,6 +25,19 @@
 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
 
+/*
+ * The table has always exactly one reference from either mapped_device->map
+ * or hash_cell->new_map. This reference is not counted in table->holders.
+ * A pair of dm_create_table/dm_destroy_table functions is used for table
+ * creation/destruction.
+ *
+ * Temporary references from the other code increase table->holders. A pair
+ * of dm_table_get/dm_table_put functions is used to manipulate it.
+ *
+ * When the table is about to be destroyed, we wait for table->holders to
+ * drop to zero.
+ */
+
 struct dm_table {
 	struct mapped_device *md;
 	atomic_t holders;
@@ -228,7 +242,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&t->devices);
-	atomic_set(&t->holders, 1);
+	atomic_set(&t->holders, 0);
 	t->barriers_supported = 1;
 
 	if (!num_targets)
@@ -259,10 +273,14 @@ static void free_devices(struct list_head *devices)
 	}
 }
 
-static void table_destroy(struct dm_table *t)
+void dm_table_destroy(struct dm_table *t)
 {
 	unsigned int i;
 
+	while (atomic_read(&t->holders))
+		msleep(1);
+	smp_mb();
+
 	/* free the indexes (see dm_table_complete) */
 	if (t->depth >= 2)
 		vfree(t->index[t->depth - 2]);
@@ -300,8 +318,8 @@ void dm_table_put(struct dm_table *t)
 	if (!t)
 		return;
 
-	if (atomic_dec_and_test(&t->holders))
-		table_destroy(t);
+	smp_mb__before_atomic_dec();
+	atomic_dec(&t->holders);
 }
 
 /*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dd953b1..9f9aa64 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -977,8 +977,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	struct mapped_device *md = congested_data;
 	struct dm_table *map;
 
-	atomic_inc(&md->pending);
-
 	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
 		map = dm_get_table(md);
 		if (map) {
@@ -987,10 +985,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 		}
 	}
 
-	if (!atomic_dec_return(&md->pending))
-		/* nudge anyone waiting on suspend queue */
-		wake_up(&md->wait);
-
 	return r;
 }
 
@@ -1250,10 +1244,12 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 
 	if (md->suspended_bdev)
 		__set_size(md, size);
-	if (size == 0)
+
+	if (!size) {
+		dm_table_destroy(t);
 		return 0;
+	}
 
-	dm_table_get(t);
 	dm_table_event_callback(t, event_callback, md);
 
 	write_lock(&md->map_lock);
@@ -1275,7 +1271,7 @@ static void __unbind(struct mapped_device *md)
 	write_lock(&md->map_lock);
 	md->map = NULL;
 	write_unlock(&md->map_lock);
-	dm_table_put(map);
+	dm_table_destroy(map);
 }
 
 /*
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 5b5d08b..bbbe911 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -36,6 +36,7 @@ struct dm_table;
 /*-----------------------------------------------------------------
  * Internal table functions.
  *---------------------------------------------------------------*/
+void dm_table_destroy(struct dm_table *t);
 void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
-- 
cgit v0.10.2


From 784aae735d9b0bba3f8b9faef4c8b30df3bf0128 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Tue, 6 Jan 2009 03:05:12 +0000
Subject: dm: add name and uuid to sysfs

Implement simple read-only sysfs entry for device-mapper block device.

This patch adds a simple sysfs directory named "dm" under block device
properties and implements
	- name attribute (string containing mapped device name)
	- uuid attribute (string containing UUID, or empty string if not set)

The kobject is embedded in mapped_device struct, so no additional
memory allocation is needed for initializing sysfs entry.

During the processing of sysfs attribute we need to lock mapped device
which is done by a new function dm_get_from_kobj, which returns the md
associated with kobject and increases the usage count.

Each 'show attribute' function is responsible for its own locking.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1c61580..63f0ae9 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,7 +3,7 @@
 #
 
 dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
 dm-multipath-objs := dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o
 dm-mirror-objs	:= dm-raid1.o
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
new file mode 100644
index 0000000..a2a45e6
--- /dev/null
+++ b/drivers/md/dm-sysfs.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/sysfs.h>
+#include <linux/dm-ioctl.h>
+#include "dm.h"
+
+struct dm_sysfs_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct mapped_device *, char *);
+	ssize_t (*store)(struct mapped_device *, char *);
+};
+
+#define DM_ATTR_RO(_name) \
+struct dm_sysfs_attr dm_attr_##_name = \
+	__ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL)
+
+static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
+			    char *page)
+{
+	struct dm_sysfs_attr *dm_attr;
+	struct mapped_device *md;
+	ssize_t ret;
+
+	dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
+	if (!dm_attr->show)
+		return -EIO;
+
+	md = dm_get_from_kobject(kobj);
+	if (!md)
+		return -EINVAL;
+
+	ret = dm_attr->show(md, page);
+	dm_put(md);
+
+	return ret;
+}
+
+static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
+{
+	if (dm_copy_name_and_uuid(md, buf, NULL))
+		return -EIO;
+
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
+{
+	if (dm_copy_name_and_uuid(md, NULL, buf))
+		return -EIO;
+
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+static DM_ATTR_RO(name);
+static DM_ATTR_RO(uuid);
+
+static struct attribute *dm_attrs[] = {
+	&dm_attr_name.attr,
+	&dm_attr_uuid.attr,
+	NULL,
+};
+
+static struct sysfs_ops dm_sysfs_ops = {
+	.show	= dm_attr_show,
+};
+
+/*
+ * dm kobject is embedded in mapped_device structure
+ * no need to define release function here
+ */
+static struct kobj_type dm_ktype = {
+	.sysfs_ops	= &dm_sysfs_ops,
+	.default_attrs	= dm_attrs,
+};
+
+/*
+ * Initialize kobj
+ * because nobody using md yet, no need to call explicit dm_get/put
+ */
+int dm_sysfs_init(struct mapped_device *md)
+{
+	return kobject_init_and_add(dm_kobject(md), &dm_ktype,
+				    &disk_to_dev(dm_disk(md))->kobj,
+				    "%s", "dm");
+}
+
+/*
+ * Remove kobj, called after all references removed
+ */
+void dm_sysfs_exit(struct mapped_device *md)
+{
+	kobject_put(dm_kobject(md));
+}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9f9aa64..51ba1db 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -167,6 +167,9 @@ struct mapped_device {
 
 	/* forced geometry settings */
 	struct hd_geometry geometry;
+
+	/* sysfs handle */
+	struct kobject kobj;
 };
 
 #define MIN_IOS 256
@@ -1285,6 +1288,8 @@ int dm_create(int minor, struct mapped_device **result)
 	if (!md)
 		return -ENXIO;
 
+	dm_sysfs_init(md);
+
 	*result = md;
 	return 0;
 }
@@ -1360,6 +1365,7 @@ void dm_put(struct mapped_device *md)
 			dm_table_presuspend_targets(map);
 			dm_table_postsuspend_targets(map);
 		}
+		dm_sysfs_exit(md);
 		dm_table_put(map);
 		__unbind(md);
 		free_dev(md);
@@ -1699,6 +1705,27 @@ struct gendisk *dm_disk(struct mapped_device *md)
 	return md->disk;
 }
 
+struct kobject *dm_kobject(struct mapped_device *md)
+{
+	return &md->kobj;
+}
+
+/*
+ * struct mapped_device should not be exported outside of dm.c
+ * so use this check to verify that kobj is part of md structure
+ */
+struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
+{
+	struct mapped_device *md;
+
+	md = container_of(kobj, struct mapped_device, kobj);
+	if (&md->kobj != kobj)
+		return NULL;
+
+	dm_get(md);
+	return md;
+}
+
 int dm_suspended(struct mapped_device *md)
 {
 	return test_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index bbbe911..20194e0 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -74,6 +74,14 @@ int dm_interface_init(void);
 void dm_interface_exit(void);
 
 /*
+ * sysfs interface
+ */
+int dm_sysfs_init(struct mapped_device *md);
+void dm_sysfs_exit(struct mapped_device *md);
+struct kobject *dm_kobject(struct mapped_device *md);
+struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
+
+/*
  * Targets for linear and striped mappings
  */
 int dm_linear_init(void);
-- 
cgit v0.10.2


From fe9cf30eb8186ef267d1868dc9f12f2d0f40835a Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Tue, 6 Jan 2009 03:05:13 +0000
Subject: dm mpath: move trigger_event to system workqueue

The same workqueue is used both for sending uevents and processing queued I/O.
Deadlock has been reported in RHEL5 when sending a uevent was blocked waiting
for the queued I/O to be processed.  Use scheduled_work() for the asynchronous
uevents instead.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 345a260..095f77b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -889,7 +889,7 @@ static int fail_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	queue_work(kmultipathd, &pgpath->deactivate_path);
 
 out:
@@ -932,7 +932,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
@@ -976,7 +976,7 @@ static void bypass_pg(struct multipath *m, struct priority_group *pg,
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 }
 
 /*
@@ -1006,7 +1006,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 	}
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	return 0;
 }
 
-- 
cgit v0.10.2


From aea53d92f70eeb00ae480e399a997dd55fd5055d Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 6 Jan 2009 03:05:15 +0000
Subject: dm snapshot: separate out exception store interface

Pull structures that bridge the gap between snapshot and
exception store out of dm-snap.h and put them in a new
.h file - dm-exception-store.h.  This file will define the
API for new exception stores.

Ultimately, dm-snap.h is unnecessary, since only dm-snap.c
should be using it.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 01590f3e..ef152e6 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -7,6 +7,7 @@
  * This file is released under the GPL.
  */
 
+#include "dm-exception-store.h"
 #include "dm-snap.h"
 
 #include <linux/mm.h>
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
new file mode 100644
index 0000000..d75f775
--- /dev/null
+++ b/drivers/md/dm-exception-store.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-mapper snapshot exception store.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_EXCEPTION_STORE
+#define _LINUX_DM_EXCEPTION_STORE
+
+#include <linux/blkdev.h>
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 32k - 512k.
+ */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
+ * of chunks that follow contiguously.  Remaining bits hold the number of the
+ * chunk within the device.
+ */
+struct dm_snap_exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct exception_store {
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*destroy) (struct exception_store *store);
+
+	/*
+	 * The target shouldn't read the COW device until this is
+	 * called.
+	 */
+	int (*read_metadata) (struct exception_store *store);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception) (struct exception_store *store,
+				  struct dm_snap_exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	void (*commit_exception) (struct exception_store *store,
+				  struct dm_snap_exception *e,
+				  void (*callback) (void *, int success),
+				  void *callback_context);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot) (struct exception_store *store);
+
+	/*
+	 * Return how full the snapshot is.
+	 */
+	void (*fraction_full) (struct exception_store *store,
+			       sector_t *numerator,
+			       sector_t *denominator);
+
+	struct dm_snapshot *snap;
+	void *context;
+};
+
+/*
+ * Funtions to manipulate consecutive chunks
+ */
+#  if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#    define DM_CHUNK_CONSECUTIVE_BITS 8
+#    define DM_CHUNK_NUMBER_BITS 56
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
+
+	BUG_ON(!dm_consecutive_chunk_count(e));
+}
+
+#  else
+#    define DM_CHUNK_CONSECUTIVE_BITS 0
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk;
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return 0;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+}
+
+#  endif
+
+/*
+ * Two exception store implementations.
+ */
+int dm_create_persistent(struct exception_store *store);
+
+int dm_create_transient(struct exception_store *store);
+
+#endif /* _LINUX_DM_EXCEPTION_STORE */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a8005b4..81f03a0 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -21,6 +21,7 @@
 #include <linux/log2.h>
 #include <linux/dm-kcopyd.h>
 
+#include "dm-exception-store.h"
 #include "dm-snap.h"
 #include "dm-bio-list.h"
 
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 99c0106..6e4beaf 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -1,6 +1,4 @@
 /*
- * dm-snapshot.c
- *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
  *
  * This file is released under the GPL.
@@ -10,6 +8,7 @@
 #define DM_SNAPSHOT_H
 
 #include <linux/device-mapper.h>
+#include "dm-exception-store.h"
 #include "dm-bio-list.h"
 #include <linux/blkdev.h>
 #include <linux/workqueue.h>
@@ -20,116 +19,6 @@ struct exception_table {
 	struct list_head *table;
 };
 
-/*
- * The snapshot code deals with largish chunks of the disk at a
- * time. Typically 32k - 512k.
- */
-typedef sector_t chunk_t;
-
-/*
- * An exception is used where an old chunk of data has been
- * replaced by a new one.
- * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
- * of chunks that follow contiguously.  Remaining bits hold the number of the
- * chunk within the device.
- */
-struct dm_snap_exception {
-	struct list_head hash_list;
-
-	chunk_t old_chunk;
-	chunk_t new_chunk;
-};
-
-/*
- * Funtions to manipulate consecutive chunks
- */
-#  if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
-#    define DM_CHUNK_CONSECUTIVE_BITS 8
-#    define DM_CHUNK_NUMBER_BITS 56
-
-static inline chunk_t dm_chunk_number(chunk_t chunk)
-{
-	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
-}
-
-static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
-{
-	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
-}
-
-static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
-{
-	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
-
-	BUG_ON(!dm_consecutive_chunk_count(e));
-}
-
-#  else
-#    define DM_CHUNK_CONSECUTIVE_BITS 0
-
-static inline chunk_t dm_chunk_number(chunk_t chunk)
-{
-	return chunk;
-}
-
-static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
-{
-	return 0;
-}
-
-static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
-{
-}
-
-#  endif
-
-/*
- * Abstraction to handle the meta/layout of exception stores (the
- * COW device).
- */
-struct exception_store {
-
-	/*
-	 * Destroys this object when you've finished with it.
-	 */
-	void (*destroy) (struct exception_store *store);
-
-	/*
-	 * The target shouldn't read the COW device until this is
-	 * called.
-	 */
-	int (*read_metadata) (struct exception_store *store);
-
-	/*
-	 * Find somewhere to store the next exception.
-	 */
-	int (*prepare_exception) (struct exception_store *store,
-				  struct dm_snap_exception *e);
-
-	/*
-	 * Update the metadata with this exception.
-	 */
-	void (*commit_exception) (struct exception_store *store,
-				  struct dm_snap_exception *e,
-				  void (*callback) (void *, int success),
-				  void *callback_context);
-
-	/*
-	 * The snapshot is invalid, note this in the metadata.
-	 */
-	void (*drop_snapshot) (struct exception_store *store);
-
-	/*
-	 * Return how full the snapshot is.
-	 */
-	void (*fraction_full) (struct exception_store *store,
-			       sector_t *numerator,
-			       sector_t *denominator);
-
-	struct dm_snapshot *snap;
-	void *context;
-};
-
 #define DM_TRACKED_CHUNK_HASH_SIZE	16
 #define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
 					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
@@ -193,14 +82,6 @@ struct dm_snapshot {
 int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
 
 /*
- * Constructor and destructor for the default persistent
- * store.
- */
-int dm_create_persistent(struct exception_store *store);
-
-int dm_create_transient(struct exception_store *store);
-
-/*
  * Return the number of sectors in the device.
  */
 static inline sector_t get_dev_size(struct block_device *bdev)
-- 
cgit v0.10.2


From 1ae25f9c933d1432fbffdf3e126051a974608abf Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 6 Jan 2009 03:05:16 +0000
Subject: dm snapshot: rename struct exception_store

Rename struct exception_store to dm_exception_store.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index ef152e6..c5c9a26 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -462,19 +462,19 @@ static int read_exceptions(struct pstore *ps)
 	return 0;
 }
 
-static struct pstore *get_info(struct exception_store *store)
+static struct pstore *get_info(struct dm_exception_store *store)
 {
 	return (struct pstore *) store->context;
 }
 
-static void persistent_fraction_full(struct exception_store *store,
+static void persistent_fraction_full(struct dm_exception_store *store,
 				     sector_t *numerator, sector_t *denominator)
 {
 	*numerator = get_info(store)->next_free * store->snap->chunk_size;
 	*denominator = get_dev_size(store->snap->cow->bdev);
 }
 
-static void persistent_destroy(struct exception_store *store)
+static void persistent_destroy(struct dm_exception_store *store)
 {
 	struct pstore *ps = get_info(store);
 
@@ -485,7 +485,7 @@ static void persistent_destroy(struct exception_store *store)
 	kfree(ps);
 }
 
-static int persistent_read_metadata(struct exception_store *store)
+static int persistent_read_metadata(struct dm_exception_store *store)
 {
 	int r, uninitialized_var(new_snapshot);
 	struct pstore *ps = get_info(store);
@@ -551,7 +551,7 @@ static int persistent_read_metadata(struct exception_store *store)
 	return 0;
 }
 
-static int persistent_prepare(struct exception_store *store,
+static int persistent_prepare(struct dm_exception_store *store,
 			      struct dm_snap_exception *e)
 {
 	struct pstore *ps = get_info(store);
@@ -578,7 +578,7 @@ static int persistent_prepare(struct exception_store *store,
 	return 0;
 }
 
-static void persistent_commit(struct exception_store *store,
+static void persistent_commit(struct dm_exception_store *store,
 			      struct dm_snap_exception *e,
 			      void (*callback) (void *, int success),
 			      void *callback_context)
@@ -640,7 +640,7 @@ static void persistent_commit(struct exception_store *store,
 	ps->callback_count = 0;
 }
 
-static void persistent_drop(struct exception_store *store)
+static void persistent_drop(struct dm_exception_store *store)
 {
 	struct pstore *ps = get_info(store);
 
@@ -649,7 +649,7 @@ static void persistent_drop(struct exception_store *store)
 		DMWARN("write header failed");
 }
 
-int dm_create_persistent(struct exception_store *store)
+int dm_create_persistent(struct dm_exception_store *store)
 {
 	struct pstore *ps;
 
@@ -694,17 +694,17 @@ struct transient_c {
 	sector_t next_free;
 };
 
-static void transient_destroy(struct exception_store *store)
+static void transient_destroy(struct dm_exception_store *store)
 {
 	kfree(store->context);
 }
 
-static int transient_read_metadata(struct exception_store *store)
+static int transient_read_metadata(struct dm_exception_store *store)
 {
 	return 0;
 }
 
-static int transient_prepare(struct exception_store *store,
+static int transient_prepare(struct dm_exception_store *store,
 			     struct dm_snap_exception *e)
 {
 	struct transient_c *tc = (struct transient_c *) store->context;
@@ -719,7 +719,7 @@ static int transient_prepare(struct exception_store *store,
 	return 0;
 }
 
-static void transient_commit(struct exception_store *store,
+static void transient_commit(struct dm_exception_store *store,
 			     struct dm_snap_exception *e,
 			     void (*callback) (void *, int success),
 			     void *callback_context)
@@ -728,14 +728,14 @@ static void transient_commit(struct exception_store *store,
 	callback(callback_context, 1);
 }
 
-static void transient_fraction_full(struct exception_store *store,
+static void transient_fraction_full(struct dm_exception_store *store,
 				    sector_t *numerator, sector_t *denominator)
 {
 	*numerator = ((struct transient_c *) store->context)->next_free;
 	*denominator = get_dev_size(store->snap->cow->bdev);
 }
 
-int dm_create_transient(struct exception_store *store)
+int dm_create_transient(struct dm_exception_store *store)
 {
 	struct transient_c *tc;
 
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index d75f775..25677df 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -36,28 +36,29 @@ struct dm_snap_exception {
  * Abstraction to handle the meta/layout of exception stores (the
  * COW device).
  */
-struct exception_store {
+struct dm_exception_store {
+
 	/*
 	 * Destroys this object when you've finished with it.
 	 */
-	void (*destroy) (struct exception_store *store);
+	void (*destroy) (struct dm_exception_store *store);
 
 	/*
 	 * The target shouldn't read the COW device until this is
 	 * called.
 	 */
-	int (*read_metadata) (struct exception_store *store);
+	int (*read_metadata) (struct dm_exception_store *store);
 
 	/*
 	 * Find somewhere to store the next exception.
 	 */
-	int (*prepare_exception) (struct exception_store *store,
+	int (*prepare_exception) (struct dm_exception_store *store,
 				  struct dm_snap_exception *e);
 
 	/*
 	 * Update the metadata with this exception.
 	 */
-	void (*commit_exception) (struct exception_store *store,
+	void (*commit_exception) (struct dm_exception_store *store,
 				  struct dm_snap_exception *e,
 				  void (*callback) (void *, int success),
 				  void *callback_context);
@@ -65,12 +66,12 @@ struct exception_store {
 	/*
 	 * The snapshot is invalid, note this in the metadata.
 	 */
-	void (*drop_snapshot) (struct exception_store *store);
+	void (*drop_snapshot) (struct dm_exception_store *store);
 
 	/*
 	 * Return how full the snapshot is.
 	 */
-	void (*fraction_full) (struct exception_store *store,
+	void (*fraction_full) (struct dm_exception_store *store,
 			       sector_t *numerator,
 			       sector_t *denominator);
 
@@ -124,8 +125,8 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
 /*
  * Two exception store implementations.
  */
-int dm_create_persistent(struct exception_store *store);
+int dm_create_persistent(struct dm_exception_store *store);
 
-int dm_create_transient(struct exception_store *store);
+int dm_create_transient(struct dm_exception_store *store);
 
 #endif /* _LINUX_DM_EXCEPTION_STORE */
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 6e4beaf..9281236 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -61,7 +61,7 @@ struct dm_snapshot {
 	spinlock_t pe_lock;
 
 	/* The on disk metadata handler */
-	struct exception_store store;
+	struct dm_exception_store store;
 
 	struct dm_kcopyd_client *kcopyd_client;
 
-- 
cgit v0.10.2


From 4db6bfe02bdc7dc5048f46dd682a94801d029adc Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Tue, 6 Jan 2009 03:05:17 +0000
Subject: dm snapshot: split out exception store implementations

Move the existing snapshot exception store implementations out into
separate files.  Later patches will place these behind a new
interface in preparation for alternative implementations.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 63f0ae9..72880b7 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -5,7 +5,8 @@
 dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
 dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o
+dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+		    dm-snap-persistent.o
 dm-mirror-objs	:= dm-raid1.o
 md-mod-objs     := md.o bitmap.o
 raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index c5c9a26..74777e0 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -1,757 +1,46 @@
 /*
- * dm-exception-store.c
- *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- * Copyright (C) 2006 Red Hat GmbH
+ * Copyright (C) 2006-2008 Red Hat GmbH
  *
  * This file is released under the GPL.
  */
 
 #include "dm-exception-store.h"
-#include "dm-snap.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
-#include <linux/dm-io.h>
-#include <linux/dm-kcopyd.h>
-
-#define DM_MSG_PREFIX "snapshots"
-#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
-
-/*-----------------------------------------------------------------
- * Persistent snapshots, by persistent we mean that the snapshot
- * will survive a reboot.
- *---------------------------------------------------------------*/
-
-/*
- * We need to store a record of which parts of the origin have
- * been copied to the snapshot device.  The snapshot code
- * requires that we copy exception chunks to chunk aligned areas
- * of the COW store.  It makes sense therefore, to store the
- * metadata in chunk size blocks.
- *
- * There is no backward or forward compatibility implemented,
- * snapshots with different disk versions than the kernel will
- * not be usable.  It is expected that "lvcreate" will blank out
- * the start of a fresh COW device before calling the snapshot
- * constructor.
- *
- * The first chunk of the COW device just contains the header.
- * After this there is a chunk filled with exception metadata,
- * followed by as many exception chunks as can fit in the
- * metadata areas.
- *
- * All on disk structures are in little-endian format.  The end
- * of the exceptions info is indicated by an exception with a
- * new_chunk of 0, which is invalid since it would point to the
- * header chunk.
- */
-
-/*
- * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
- */
-#define SNAP_MAGIC 0x70416e53
-
-/*
- * The on-disk version of the metadata.
- */
-#define SNAPSHOT_DISK_VERSION 1
-
-struct disk_header {
-	uint32_t magic;
-
-	/*
-	 * Is this snapshot valid.  There is no way of recovering
-	 * an invalid snapshot.
-	 */
-	uint32_t valid;
-
-	/*
-	 * Simple, incrementing version. no backward
-	 * compatibility.
-	 */
-	uint32_t version;
-
-	/* In sectors */
-	uint32_t chunk_size;
-};
-
-struct disk_exception {
-	uint64_t old_chunk;
-	uint64_t new_chunk;
-};
-
-struct commit_callback {
-	void (*callback)(void *, int success);
-	void *context;
-};
-
-/*
- * The top level structure for a persistent exception store.
- */
-struct pstore {
-	struct dm_snapshot *snap;	/* up pointer to my snapshot */
-	int version;
-	int valid;
-	uint32_t exceptions_per_area;
-
-	/*
-	 * Now that we have an asynchronous kcopyd there is no
-	 * need for large chunk sizes, so it wont hurt to have a
-	 * whole chunks worth of metadata in memory at once.
-	 */
-	void *area;
-
-	/*
-	 * An area of zeros used to clear the next area.
-	 */
-	void *zero_area;
-
-	/*
-	 * Used to keep track of which metadata area the data in
-	 * 'chunk' refers to.
-	 */
-	chunk_t current_area;
-
-	/*
-	 * The next free chunk for an exception.
-	 */
-	chunk_t next_free;
-
-	/*
-	 * The index of next free exception in the current
-	 * metadata area.
-	 */
-	uint32_t current_committed;
-
-	atomic_t pending_count;
-	uint32_t callback_count;
-	struct commit_callback *callbacks;
-	struct dm_io_client *io_client;
-
-	struct workqueue_struct *metadata_wq;
-};
-
-static unsigned sectors_to_pages(unsigned sectors)
-{
-	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
-}
-
-static int alloc_area(struct pstore *ps)
-{
-	int r = -ENOMEM;
-	size_t len;
-
-	len = ps->snap->chunk_size << SECTOR_SHIFT;
-
-	/*
-	 * Allocate the chunk_size block of memory that will hold
-	 * a single metadata area.
-	 */
-	ps->area = vmalloc(len);
-	if (!ps->area)
-		return r;
-
-	ps->zero_area = vmalloc(len);
-	if (!ps->zero_area) {
-		vfree(ps->area);
-		return r;
-	}
-	memset(ps->zero_area, 0, len);
-
-	return 0;
-}
-
-static void free_area(struct pstore *ps)
-{
-	vfree(ps->area);
-	ps->area = NULL;
-	vfree(ps->zero_area);
-	ps->zero_area = NULL;
-}
-
-struct mdata_req {
-	struct dm_io_region *where;
-	struct dm_io_request *io_req;
-	struct work_struct work;
-	int result;
-};
-
-static void do_metadata(struct work_struct *work)
-{
-	struct mdata_req *req = container_of(work, struct mdata_req, work);
-
-	req->result = dm_io(req->io_req, 1, req->where, NULL);
-}
-
-/*
- * Read or write a chunk aligned and sized block of data from a device.
- */
-static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * chunk,
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = rw,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-	struct mdata_req req;
-
-	if (!metadata)
-		return dm_io(&io_req, 1, &where, NULL);
-
-	req.where = &where;
-	req.io_req = &io_req;
-
-	/*
-	 * Issue the synchronous I/O from a different thread
-	 * to avoid generic_make_request recursion.
-	 */
-	INIT_WORK(&req.work, do_metadata);
-	queue_work(ps->metadata_wq, &req.work);
-	flush_workqueue(ps->metadata_wq);
-
-	return req.result;
-}
-
-/*
- * Convert a metadata area index to a chunk index.
- */
-static chunk_t area_location(struct pstore *ps, chunk_t area)
-{
-	return 1 + ((ps->exceptions_per_area + 1) * area);
-}
-
-/*
- * Read or write a metadata area.  Remembering to skip the first
- * chunk which holds the header.
- */
-static int area_io(struct pstore *ps, int rw)
-{
-	int r;
-	chunk_t chunk;
-
-	chunk = area_location(ps, ps->current_area);
-
-	r = chunk_io(ps, chunk, rw, 0);
-	if (r)
-		return r;
-
-	return 0;
-}
-
-static void zero_memory_area(struct pstore *ps)
-{
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-}
-
-static int zero_disk_area(struct pstore *ps, chunk_t area)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * area_location(ps, area),
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->zero_area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-
-	return dm_io(&io_req, 1, &where, NULL);
-}
-
-static int read_header(struct pstore *ps, int *new_snapshot)
-{
-	int r;
-	struct disk_header *dh;
-	chunk_t chunk_size;
-	int chunk_size_supplied = 1;
-
-	/*
-	 * Use default chunk size (or hardsect_size, if larger) if none supplied
-	 */
-	if (!ps->snap->chunk_size) {
-        	ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
-		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
-		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
-		chunk_size_supplied = 0;
-	}
-
-	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
-							     chunk_size));
-	if (IS_ERR(ps->io_client))
-		return PTR_ERR(ps->io_client);
-
-	r = alloc_area(ps);
-	if (r)
-		return r;
-
-	r = chunk_io(ps, 0, READ, 1);
-	if (r)
-		goto bad;
-
-	dh = (struct disk_header *) ps->area;
-
-	if (le32_to_cpu(dh->magic) == 0) {
-		*new_snapshot = 1;
-		return 0;
-	}
-
-	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
-		DMWARN("Invalid or corrupt snapshot");
-		r = -ENXIO;
-		goto bad;
-	}
-
-	*new_snapshot = 0;
-	ps->valid = le32_to_cpu(dh->valid);
-	ps->version = le32_to_cpu(dh->version);
-	chunk_size = le32_to_cpu(dh->chunk_size);
-
-	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
-		return 0;
-
-	DMWARN("chunk size %llu in device metadata overrides "
-	       "table chunk size of %llu.",
-	       (unsigned long long)chunk_size,
-	       (unsigned long long)ps->snap->chunk_size);
-
-	/* We had a bogus chunk_size. Fix stuff up. */
-	free_area(ps);
-
-	ps->snap->chunk_size = chunk_size;
-	ps->snap->chunk_mask = chunk_size - 1;
-	ps->snap->chunk_shift = ffs(chunk_size) - 1;
-
-	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
-				ps->io_client);
-	if (r)
-		return r;
-
-	r = alloc_area(ps);
-	return r;
-
-bad:
-	free_area(ps);
-	return r;
-}
-
-static int write_header(struct pstore *ps)
-{
-	struct disk_header *dh;
-
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-
-	dh = (struct disk_header *) ps->area;
-	dh->magic = cpu_to_le32(SNAP_MAGIC);
-	dh->valid = cpu_to_le32(ps->valid);
-	dh->version = cpu_to_le32(ps->version);
-	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
-
-	return chunk_io(ps, 0, WRITE, 1);
-}
-
-/*
- * Access functions for the disk exceptions, these do the endian conversions.
- */
-static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
-{
-	BUG_ON(index >= ps->exceptions_per_area);
-
-	return ((struct disk_exception *) ps->area) + index;
-}
+#include <linux/device-mapper.h>
 
-static void read_exception(struct pstore *ps,
-			   uint32_t index, struct disk_exception *result)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	result->old_chunk = le64_to_cpu(e->old_chunk);
-	result->new_chunk = le64_to_cpu(e->new_chunk);
-}
-
-static void write_exception(struct pstore *ps,
-			    uint32_t index, struct disk_exception *de)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	e->old_chunk = cpu_to_le64(de->old_chunk);
-	e->new_chunk = cpu_to_le64(de->new_chunk);
-}
+#define DM_MSG_PREFIX "snapshot exception stores"
 
-/*
- * Registers the exceptions that are present in the current area.
- * 'full' is filled in to indicate if the area has been
- * filled.
- */
-static int insert_exceptions(struct pstore *ps, int *full)
+int dm_exception_store_init(void)
 {
 	int r;
-	unsigned int i;
-	struct disk_exception de;
-
-	/* presume the area is full */
-	*full = 1;
-
-	for (i = 0; i < ps->exceptions_per_area; i++) {
-		read_exception(ps, i, &de);
-
-		/*
-		 * If the new_chunk is pointing at the start of
-		 * the COW device, where the first metadata area
-		 * is we know that we've hit the end of the
-		 * exceptions.  Therefore the area is not full.
-		 */
-		if (de.new_chunk == 0LL) {
-			ps->current_committed = i;
-			*full = 0;
-			break;
-		}
-
-		/*
-		 * Keep track of the start of the free chunks.
-		 */
-		if (ps->next_free <= de.new_chunk)
-			ps->next_free = de.new_chunk + 1;
-
-		/*
-		 * Otherwise we add the exception to the snapshot.
-		 */
-		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
-		if (r)
-			return r;
-	}
-
-	return 0;
-}
-
-static int read_exceptions(struct pstore *ps)
-{
-	int r, full = 1;
-
-	/*
-	 * Keeping reading chunks and inserting exceptions until
-	 * we find a partially full area.
-	 */
-	for (ps->current_area = 0; full; ps->current_area++) {
-		r = area_io(ps, READ);
-		if (r)
-			return r;
 
-		r = insert_exceptions(ps, &full);
-		if (r)
-			return r;
+	r = dm_transient_snapshot_init();
+	if (r) {
+		DMERR("Unable to register transient exception store type.");
+		goto transient_fail;
 	}
 
-	ps->current_area--;
-
-	return 0;
-}
-
-static struct pstore *get_info(struct dm_exception_store *store)
-{
-	return (struct pstore *) store->context;
-}
-
-static void persistent_fraction_full(struct dm_exception_store *store,
-				     sector_t *numerator, sector_t *denominator)
-{
-	*numerator = get_info(store)->next_free * store->snap->chunk_size;
-	*denominator = get_dev_size(store->snap->cow->bdev);
-}
-
-static void persistent_destroy(struct dm_exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	destroy_workqueue(ps->metadata_wq);
-	dm_io_client_destroy(ps->io_client);
-	vfree(ps->callbacks);
-	free_area(ps);
-	kfree(ps);
-}
-
-static int persistent_read_metadata(struct dm_exception_store *store)
-{
-	int r, uninitialized_var(new_snapshot);
-	struct pstore *ps = get_info(store);
-
-	/*
-	 * Read the snapshot header.
-	 */
-	r = read_header(ps, &new_snapshot);
-	if (r)
-		return r;
-
-	/*
-	 * Now we know correct chunk_size, complete the initialisation.
-	 */
-	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
-				  sizeof(struct disk_exception);
-	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-			sizeof(*ps->callbacks));
-	if (!ps->callbacks)
-		return -ENOMEM;
-
-	/*
-	 * Do we need to setup a new snapshot ?
-	 */
-	if (new_snapshot) {
-		r = write_header(ps);
-		if (r) {
-			DMWARN("write_header failed");
-			return r;
-		}
-
-		ps->current_area = 0;
-		zero_memory_area(ps);
-		r = zero_disk_area(ps, 0);
-		if (r) {
-			DMWARN("zero_disk_area(0) failed");
-			return r;
-		}
-	} else {
-		/*
-		 * Sanity checks.
-		 */
-		if (ps->version != SNAPSHOT_DISK_VERSION) {
-			DMWARN("unable to handle snapshot disk version %d",
-			       ps->version);
-			return -EINVAL;
-		}
-
-		/*
-		 * Metadata are valid, but snapshot is invalidated
-		 */
-		if (!ps->valid)
-			return 1;
-
-		/*
-		 * Read the metadata.
-		 */
-		r = read_exceptions(ps);
-		if (r)
-			return r;
+	r = dm_persistent_snapshot_init();
+	if (r) {
+		DMERR("Unable to register persistent exception store type");
+		goto persistent_fail;
 	}
 
 	return 0;
-}
-
-static int persistent_prepare(struct dm_exception_store *store,
-			      struct dm_snap_exception *e)
-{
-	struct pstore *ps = get_info(store);
-	uint32_t stride;
-	chunk_t next_free;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	/* Is there enough room ? */
-	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
-		return -ENOSPC;
 
-	e->new_chunk = ps->next_free;
-
-	/*
-	 * Move onto the next free pending, making sure to take
-	 * into account the location of the metadata chunks.
-	 */
-	stride = (ps->exceptions_per_area + 1);
-	next_free = ++ps->next_free;
-	if (sector_div(next_free, stride) == 1)
-		ps->next_free++;
-
-	atomic_inc(&ps->pending_count);
-	return 0;
-}
-
-static void persistent_commit(struct dm_exception_store *store,
-			      struct dm_snap_exception *e,
-			      void (*callback) (void *, int success),
-			      void *callback_context)
-{
-	unsigned int i;
-	struct pstore *ps = get_info(store);
-	struct disk_exception de;
-	struct commit_callback *cb;
-
-	de.old_chunk = e->old_chunk;
-	de.new_chunk = e->new_chunk;
-	write_exception(ps, ps->current_committed++, &de);
-
-	/*
-	 * Add the callback to the back of the array.  This code
-	 * is the only place where the callback array is
-	 * manipulated, and we know that it will never be called
-	 * multiple times concurrently.
-	 */
-	cb = ps->callbacks + ps->callback_count++;
-	cb->callback = callback;
-	cb->context = callback_context;
-
-	/*
-	 * If there are exceptions in flight and we have not yet
-	 * filled this metadata area there's nothing more to do.
-	 */
-	if (!atomic_dec_and_test(&ps->pending_count) &&
-	    (ps->current_committed != ps->exceptions_per_area))
-		return;
-
-	/*
-	 * If we completely filled the current area, then wipe the next one.
-	 */
-	if ((ps->current_committed == ps->exceptions_per_area) &&
-	     zero_disk_area(ps, ps->current_area + 1))
-		ps->valid = 0;
-
-	/*
-	 * Commit exceptions to disk.
-	 */
-	if (ps->valid && area_io(ps, WRITE))
-		ps->valid = 0;
-
-	/*
-	 * Advance to the next area if this one is full.
-	 */
-	if (ps->current_committed == ps->exceptions_per_area) {
-		ps->current_committed = 0;
-		ps->current_area++;
-		zero_memory_area(ps);
-	}
-
-	for (i = 0; i < ps->callback_count; i++) {
-		cb = ps->callbacks + i;
-		cb->callback(cb->context, ps->valid);
-	}
-
-	ps->callback_count = 0;
-}
-
-static void persistent_drop(struct dm_exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	ps->valid = 0;
-	if (write_header(ps))
-		DMWARN("write header failed");
-}
-
-int dm_create_persistent(struct dm_exception_store *store)
-{
-	struct pstore *ps;
-
-	/* allocate the pstore */
-	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		return -ENOMEM;
-
-	ps->snap = store->snap;
-	ps->valid = 1;
-	ps->version = SNAPSHOT_DISK_VERSION;
-	ps->area = NULL;
-	ps->next_free = 2;	/* skipping the header and first area */
-	ps->current_committed = 0;
-
-	ps->callback_count = 0;
-	atomic_set(&ps->pending_count, 0);
-	ps->callbacks = NULL;
-
-	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
-	if (!ps->metadata_wq) {
-		kfree(ps);
-		DMERR("couldn't start header metadata update thread");
-		return -ENOMEM;
-	}
-
-	store->destroy = persistent_destroy;
-	store->read_metadata = persistent_read_metadata;
-	store->prepare_exception = persistent_prepare;
-	store->commit_exception = persistent_commit;
-	store->drop_snapshot = persistent_drop;
-	store->fraction_full = persistent_fraction_full;
-	store->context = ps;
-
-	return 0;
-}
-
-/*-----------------------------------------------------------------
- * Implementation of the store for non-persistent snapshots.
- *---------------------------------------------------------------*/
-struct transient_c {
-	sector_t next_free;
-};
-
-static void transient_destroy(struct dm_exception_store *store)
-{
-	kfree(store->context);
-}
-
-static int transient_read_metadata(struct dm_exception_store *store)
-{
-	return 0;
-}
-
-static int transient_prepare(struct dm_exception_store *store,
-			     struct dm_snap_exception *e)
-{
-	struct transient_c *tc = (struct transient_c *) store->context;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	if (size < (tc->next_free + store->snap->chunk_size))
-		return -1;
-
-	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
-	tc->next_free += store->snap->chunk_size;
-
-	return 0;
-}
-
-static void transient_commit(struct dm_exception_store *store,
-			     struct dm_snap_exception *e,
-			     void (*callback) (void *, int success),
-			     void *callback_context)
-{
-	/* Just succeed */
-	callback(callback_context, 1);
-}
-
-static void transient_fraction_full(struct dm_exception_store *store,
-				    sector_t *numerator, sector_t *denominator)
-{
-	*numerator = ((struct transient_c *) store->context)->next_free;
-	*denominator = get_dev_size(store->snap->cow->bdev);
+persistent_fail:
+	dm_persistent_snapshot_exit();
+transient_fail:
+	return r;
 }
 
-int dm_create_transient(struct dm_exception_store *store)
+void dm_exception_store_exit(void)
 {
-	struct transient_c *tc;
-
-	store->destroy = transient_destroy;
-	store->read_metadata = transient_read_metadata;
-	store->prepare_exception = transient_prepare;
-	store->commit_exception = transient_commit;
-	store->drop_snapshot = NULL;
-	store->fraction_full = transient_fraction_full;
-
-	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
-	if (!tc)
-		return -ENOMEM;
-
-	tc->next_free = 0;
-	store->context = tc;
-
-	return 0;
+	dm_persistent_snapshot_exit();
+	dm_transient_snapshot_exit();
 }
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 25677df..78d1ace 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -122,9 +122,18 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
 
 #  endif
 
+int dm_exception_store_init(void);
+void dm_exception_store_exit(void);
+
 /*
  * Two exception store implementations.
  */
+int dm_persistent_snapshot_init(void);
+void dm_persistent_snapshot_exit(void);
+
+int dm_transient_snapshot_init(void);
+void dm_transient_snapshot_exit(void);
+
 int dm_create_persistent(struct dm_exception_store *store);
 
 int dm_create_transient(struct dm_exception_store *store);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
new file mode 100644
index 0000000..57c946c
--- /dev/null
+++ b/drivers/md/dm-snap-persistent.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+#include "dm-snap.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+#define DM_MSG_PREFIX "persistent snapshot"
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots, by persistent we mean that the snapshot
+ * will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device.  The snapshot code
+ * requires that we copy exception chunks to chunk aligned areas
+ * of the COW store.  It makes sense therefore, to store the
+ * metadata in chunk size blocks.
+ *
+ * There is no backward or forward compatibility implemented,
+ * snapshots with different disk versions than the kernel will
+ * not be usable.  It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format.  The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
+
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+struct disk_header {
+	uint32_t magic;
+
+	/*
+	 * Is this snapshot valid.  There is no way of recovering
+	 * an invalid snapshot.
+	 */
+	uint32_t valid;
+
+	/*
+	 * Simple, incrementing version. no backward
+	 * compatibility.
+	 */
+	uint32_t version;
+
+	/* In sectors */
+	uint32_t chunk_size;
+};
+
+struct disk_exception {
+	uint64_t old_chunk;
+	uint64_t new_chunk;
+};
+
+struct commit_callback {
+	void (*callback)(void *, int success);
+	void *context;
+};
+
+/*
+ * The top level structure for a persistent exception store.
+ */
+struct pstore {
+	struct dm_snapshot *snap;	/* up pointer to my snapshot */
+	int version;
+	int valid;
+	uint32_t exceptions_per_area;
+
+	/*
+	 * Now that we have an asynchronous kcopyd there is no
+	 * need for large chunk sizes, so it wont hurt to have a
+	 * whole chunks worth of metadata in memory at once.
+	 */
+	void *area;
+
+	/*
+	 * An area of zeros used to clear the next area.
+	 */
+	void *zero_area;
+
+	/*
+	 * Used to keep track of which metadata area the data in
+	 * 'chunk' refers to.
+	 */
+	chunk_t current_area;
+
+	/*
+	 * The next free chunk for an exception.
+	 */
+	chunk_t next_free;
+
+	/*
+	 * The index of next free exception in the current
+	 * metadata area.
+	 */
+	uint32_t current_committed;
+
+	atomic_t pending_count;
+	uint32_t callback_count;
+	struct commit_callback *callbacks;
+	struct dm_io_client *io_client;
+
+	struct workqueue_struct *metadata_wq;
+};
+
+static unsigned sectors_to_pages(unsigned sectors)
+{
+	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
+}
+
+static int alloc_area(struct pstore *ps)
+{
+	int r = -ENOMEM;
+	size_t len;
+
+	len = ps->snap->chunk_size << SECTOR_SHIFT;
+
+	/*
+	 * Allocate the chunk_size block of memory that will hold
+	 * a single metadata area.
+	 */
+	ps->area = vmalloc(len);
+	if (!ps->area)
+		return r;
+
+	ps->zero_area = vmalloc(len);
+	if (!ps->zero_area) {
+		vfree(ps->area);
+		return r;
+	}
+	memset(ps->zero_area, 0, len);
+
+	return 0;
+}
+
+static void free_area(struct pstore *ps)
+{
+	vfree(ps->area);
+	ps->area = NULL;
+	vfree(ps->zero_area);
+	ps->zero_area = NULL;
+}
+
+struct mdata_req {
+	struct dm_io_region *where;
+	struct dm_io_request *io_req;
+	struct work_struct work;
+	int result;
+};
+
+static void do_metadata(struct work_struct *work)
+{
+	struct mdata_req *req = container_of(work, struct mdata_req, work);
+
+	req->result = dm_io(req->io_req, 1, req->where, NULL);
+}
+
+/*
+ * Read or write a chunk aligned and sized block of data from a device.
+ */
+static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
+{
+	struct dm_io_region where = {
+		.bdev = ps->snap->cow->bdev,
+		.sector = ps->snap->chunk_size * chunk,
+		.count = ps->snap->chunk_size,
+	};
+	struct dm_io_request io_req = {
+		.bi_rw = rw,
+		.mem.type = DM_IO_VMA,
+		.mem.ptr.vma = ps->area,
+		.client = ps->io_client,
+		.notify.fn = NULL,
+	};
+	struct mdata_req req;
+
+	if (!metadata)
+		return dm_io(&io_req, 1, &where, NULL);
+
+	req.where = &where;
+	req.io_req = &io_req;
+
+	/*
+	 * Issue the synchronous I/O from a different thread
+	 * to avoid generic_make_request recursion.
+	 */
+	INIT_WORK(&req.work, do_metadata);
+	queue_work(ps->metadata_wq, &req.work);
+	flush_workqueue(ps->metadata_wq);
+
+	return req.result;
+}
+
+/*
+ * Convert a metadata area index to a chunk index.
+ */
+static chunk_t area_location(struct pstore *ps, chunk_t area)
+{
+	return 1 + ((ps->exceptions_per_area + 1) * area);
+}
+
+/*
+ * Read or write a metadata area.  Remembering to skip the first
+ * chunk which holds the header.
+ */
+static int area_io(struct pstore *ps, int rw)
+{
+	int r;
+	chunk_t chunk;
+
+	chunk = area_location(ps, ps->current_area);
+
+	r = chunk_io(ps, chunk, rw, 0);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+static void zero_memory_area(struct pstore *ps)
+{
+	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+}
+
+static int zero_disk_area(struct pstore *ps, chunk_t area)
+{
+	struct dm_io_region where = {
+		.bdev = ps->snap->cow->bdev,
+		.sector = ps->snap->chunk_size * area_location(ps, area),
+		.count = ps->snap->chunk_size,
+	};
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE,
+		.mem.type = DM_IO_VMA,
+		.mem.ptr.vma = ps->zero_area,
+		.client = ps->io_client,
+		.notify.fn = NULL,
+	};
+
+	return dm_io(&io_req, 1, &where, NULL);
+}
+
+static int read_header(struct pstore *ps, int *new_snapshot)
+{
+	int r;
+	struct disk_header *dh;
+	chunk_t chunk_size;
+	int chunk_size_supplied = 1;
+
+	/*
+	 * Use default chunk size (or hardsect_size, if larger) if none supplied
+	 */
+	if (!ps->snap->chunk_size) {
+		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
+		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
+		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+		chunk_size_supplied = 0;
+	}
+
+	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+							     chunk_size));
+	if (IS_ERR(ps->io_client))
+		return PTR_ERR(ps->io_client);
+
+	r = alloc_area(ps);
+	if (r)
+		return r;
+
+	r = chunk_io(ps, 0, READ, 1);
+	if (r)
+		goto bad;
+
+	dh = (struct disk_header *) ps->area;
+
+	if (le32_to_cpu(dh->magic) == 0) {
+		*new_snapshot = 1;
+		return 0;
+	}
+
+	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
+		DMWARN("Invalid or corrupt snapshot");
+		r = -ENXIO;
+		goto bad;
+	}
+
+	*new_snapshot = 0;
+	ps->valid = le32_to_cpu(dh->valid);
+	ps->version = le32_to_cpu(dh->version);
+	chunk_size = le32_to_cpu(dh->chunk_size);
+
+	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+		return 0;
+
+	DMWARN("chunk size %llu in device metadata overrides "
+	       "table chunk size of %llu.",
+	       (unsigned long long)chunk_size,
+	       (unsigned long long)ps->snap->chunk_size);
+
+	/* We had a bogus chunk_size. Fix stuff up. */
+	free_area(ps);
+
+	ps->snap->chunk_size = chunk_size;
+	ps->snap->chunk_mask = chunk_size - 1;
+	ps->snap->chunk_shift = ffs(chunk_size) - 1;
+
+	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+				ps->io_client);
+	if (r)
+		return r;
+
+	r = alloc_area(ps);
+	return r;
+
+bad:
+	free_area(ps);
+	return r;
+}
+
+static int write_header(struct pstore *ps)
+{
+	struct disk_header *dh;
+
+	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+
+	dh = (struct disk_header *) ps->area;
+	dh->magic = cpu_to_le32(SNAP_MAGIC);
+	dh->valid = cpu_to_le32(ps->valid);
+	dh->version = cpu_to_le32(ps->version);
+	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
+
+	return chunk_io(ps, 0, WRITE, 1);
+}
+
+/*
+ * Access functions for the disk exceptions, these do the endian conversions.
+ */
+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
+{
+	BUG_ON(index >= ps->exceptions_per_area);
+
+	return ((struct disk_exception *) ps->area) + index;
+}
+
+static void read_exception(struct pstore *ps,
+			   uint32_t index, struct disk_exception *result)
+{
+	struct disk_exception *e = get_exception(ps, index);
+
+	/* copy it */
+	result->old_chunk = le64_to_cpu(e->old_chunk);
+	result->new_chunk = le64_to_cpu(e->new_chunk);
+}
+
+static void write_exception(struct pstore *ps,
+			    uint32_t index, struct disk_exception *de)
+{
+	struct disk_exception *e = get_exception(ps, index);
+
+	/* copy it */
+	e->old_chunk = cpu_to_le64(de->old_chunk);
+	e->new_chunk = cpu_to_le64(de->new_chunk);
+}
+
+/*
+ * Registers the exceptions that are present in the current area.
+ * 'full' is filled in to indicate if the area has been
+ * filled.
+ */
+static int insert_exceptions(struct pstore *ps, int *full)
+{
+	int r;
+	unsigned int i;
+	struct disk_exception de;
+
+	/* presume the area is full */
+	*full = 1;
+
+	for (i = 0; i < ps->exceptions_per_area; i++) {
+		read_exception(ps, i, &de);
+
+		/*
+		 * If the new_chunk is pointing at the start of
+		 * the COW device, where the first metadata area
+		 * is we know that we've hit the end of the
+		 * exceptions.  Therefore the area is not full.
+		 */
+		if (de.new_chunk == 0LL) {
+			ps->current_committed = i;
+			*full = 0;
+			break;
+		}
+
+		/*
+		 * Keep track of the start of the free chunks.
+		 */
+		if (ps->next_free <= de.new_chunk)
+			ps->next_free = de.new_chunk + 1;
+
+		/*
+		 * Otherwise we add the exception to the snapshot.
+		 */
+		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int read_exceptions(struct pstore *ps)
+{
+	int r, full = 1;
+
+	/*
+	 * Keeping reading chunks and inserting exceptions until
+	 * we find a partially full area.
+	 */
+	for (ps->current_area = 0; full; ps->current_area++) {
+		r = area_io(ps, READ);
+		if (r)
+			return r;
+
+		r = insert_exceptions(ps, &full);
+		if (r)
+			return r;
+	}
+
+	ps->current_area--;
+
+	return 0;
+}
+
+static struct pstore *get_info(struct dm_exception_store *store)
+{
+	return (struct pstore *) store->context;
+}
+
+static void persistent_fraction_full(struct dm_exception_store *store,
+				     sector_t *numerator, sector_t *denominator)
+{
+	*numerator = get_info(store)->next_free * store->snap->chunk_size;
+	*denominator = get_dev_size(store->snap->cow->bdev);
+}
+
+static void persistent_destroy(struct dm_exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	destroy_workqueue(ps->metadata_wq);
+	dm_io_client_destroy(ps->io_client);
+	vfree(ps->callbacks);
+	free_area(ps);
+	kfree(ps);
+}
+
+static int persistent_read_metadata(struct dm_exception_store *store)
+{
+	int r, uninitialized_var(new_snapshot);
+	struct pstore *ps = get_info(store);
+
+	/*
+	 * Read the snapshot header.
+	 */
+	r = read_header(ps, &new_snapshot);
+	if (r)
+		return r;
+
+	/*
+	 * Now we know correct chunk_size, complete the initialisation.
+	 */
+	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
+				  sizeof(struct disk_exception);
+	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
+			sizeof(*ps->callbacks));
+	if (!ps->callbacks)
+		return -ENOMEM;
+
+	/*
+	 * Do we need to setup a new snapshot ?
+	 */
+	if (new_snapshot) {
+		r = write_header(ps);
+		if (r) {
+			DMWARN("write_header failed");
+			return r;
+		}
+
+		ps->current_area = 0;
+		zero_memory_area(ps);
+		r = zero_disk_area(ps, 0);
+		if (r) {
+			DMWARN("zero_disk_area(0) failed");
+			return r;
+		}
+	} else {
+		/*
+		 * Sanity checks.
+		 */
+		if (ps->version != SNAPSHOT_DISK_VERSION) {
+			DMWARN("unable to handle snapshot disk version %d",
+			       ps->version);
+			return -EINVAL;
+		}
+
+		/*
+		 * Metadata are valid, but snapshot is invalidated
+		 */
+		if (!ps->valid)
+			return 1;
+
+		/*
+		 * Read the metadata.
+		 */
+		r = read_exceptions(ps);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int persistent_prepare(struct dm_exception_store *store,
+			      struct dm_snap_exception *e)
+{
+	struct pstore *ps = get_info(store);
+	uint32_t stride;
+	chunk_t next_free;
+	sector_t size = get_dev_size(store->snap->cow->bdev);
+
+	/* Is there enough room ? */
+	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
+		return -ENOSPC;
+
+	e->new_chunk = ps->next_free;
+
+	/*
+	 * Move onto the next free pending, making sure to take
+	 * into account the location of the metadata chunks.
+	 */
+	stride = (ps->exceptions_per_area + 1);
+	next_free = ++ps->next_free;
+	if (sector_div(next_free, stride) == 1)
+		ps->next_free++;
+
+	atomic_inc(&ps->pending_count);
+	return 0;
+}
+
+static void persistent_commit(struct dm_exception_store *store,
+			      struct dm_snap_exception *e,
+			      void (*callback) (void *, int success),
+			      void *callback_context)
+{
+	unsigned int i;
+	struct pstore *ps = get_info(store);
+	struct disk_exception de;
+	struct commit_callback *cb;
+
+	de.old_chunk = e->old_chunk;
+	de.new_chunk = e->new_chunk;
+	write_exception(ps, ps->current_committed++, &de);
+
+	/*
+	 * Add the callback to the back of the array.  This code
+	 * is the only place where the callback array is
+	 * manipulated, and we know that it will never be called
+	 * multiple times concurrently.
+	 */
+	cb = ps->callbacks + ps->callback_count++;
+	cb->callback = callback;
+	cb->context = callback_context;
+
+	/*
+	 * If there are exceptions in flight and we have not yet
+	 * filled this metadata area there's nothing more to do.
+	 */
+	if (!atomic_dec_and_test(&ps->pending_count) &&
+	    (ps->current_committed != ps->exceptions_per_area))
+		return;
+
+	/*
+	 * If we completely filled the current area, then wipe the next one.
+	 */
+	if ((ps->current_committed == ps->exceptions_per_area) &&
+	     zero_disk_area(ps, ps->current_area + 1))
+		ps->valid = 0;
+
+	/*
+	 * Commit exceptions to disk.
+	 */
+	if (ps->valid && area_io(ps, WRITE))
+		ps->valid = 0;
+
+	/*
+	 * Advance to the next area if this one is full.
+	 */
+	if (ps->current_committed == ps->exceptions_per_area) {
+		ps->current_committed = 0;
+		ps->current_area++;
+		zero_memory_area(ps);
+	}
+
+	for (i = 0; i < ps->callback_count; i++) {
+		cb = ps->callbacks + i;
+		cb->callback(cb->context, ps->valid);
+	}
+
+	ps->callback_count = 0;
+}
+
+static void persistent_drop(struct dm_exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	ps->valid = 0;
+	if (write_header(ps))
+		DMWARN("write header failed");
+}
+
+int dm_create_persistent(struct dm_exception_store *store)
+{
+	struct pstore *ps;
+
+	/* allocate the pstore */
+	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	ps->snap = store->snap;
+	ps->valid = 1;
+	ps->version = SNAPSHOT_DISK_VERSION;
+	ps->area = NULL;
+	ps->next_free = 2;	/* skipping the header and first area */
+	ps->current_committed = 0;
+
+	ps->callback_count = 0;
+	atomic_set(&ps->pending_count, 0);
+	ps->callbacks = NULL;
+
+	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
+	if (!ps->metadata_wq) {
+		kfree(ps);
+		DMERR("couldn't start header metadata update thread");
+		return -ENOMEM;
+	}
+
+	store->destroy = persistent_destroy;
+	store->read_metadata = persistent_read_metadata;
+	store->prepare_exception = persistent_prepare;
+	store->commit_exception = persistent_commit;
+	store->drop_snapshot = persistent_drop;
+	store->fraction_full = persistent_fraction_full;
+	store->context = ps;
+
+	return 0;
+}
+
+int dm_persistent_snapshot_init(void)
+{
+	return 0;
+}
+
+void dm_persistent_snapshot_exit(void)
+{
+}
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
new file mode 100644
index 0000000..2a781df
--- /dev/null
+++ b/drivers/md/dm-snap-transient.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+#include "dm-snap.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+#define DM_MSG_PREFIX "transient snapshot"
+
+/*-----------------------------------------------------------------
+ * Implementation of the store for non-persistent snapshots.
+ *---------------------------------------------------------------*/
+struct transient_c {
+	sector_t next_free;
+};
+
+static void transient_destroy(struct dm_exception_store *store)
+{
+	kfree(store->context);
+}
+
+static int transient_read_metadata(struct dm_exception_store *store)
+{
+	return 0;
+}
+
+static int transient_prepare(struct dm_exception_store *store,
+			     struct dm_snap_exception *e)
+{
+	struct transient_c *tc = (struct transient_c *) store->context;
+	sector_t size = get_dev_size(store->snap->cow->bdev);
+
+	if (size < (tc->next_free + store->snap->chunk_size))
+		return -1;
+
+	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
+	tc->next_free += store->snap->chunk_size;
+
+	return 0;
+}
+
+static void transient_commit(struct dm_exception_store *store,
+			     struct dm_snap_exception *e,
+			     void (*callback) (void *, int success),
+			     void *callback_context)
+{
+	/* Just succeed */
+	callback(callback_context, 1);
+}
+
+static void transient_fraction_full(struct dm_exception_store *store,
+				    sector_t *numerator, sector_t *denominator)
+{
+	*numerator = ((struct transient_c *) store->context)->next_free;
+	*denominator = get_dev_size(store->snap->cow->bdev);
+}
+
+int dm_create_transient(struct dm_exception_store *store)
+{
+	struct transient_c *tc;
+
+	store->destroy = transient_destroy;
+	store->read_metadata = transient_read_metadata;
+	store->prepare_exception = transient_prepare;
+	store->commit_exception = transient_commit;
+	store->drop_snapshot = NULL;
+	store->fraction_full = transient_fraction_full;
+
+	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
+	if (!tc)
+		return -ENOMEM;
+
+	tc->next_free = 0;
+	store->context = tc;
+
+	return 0;
+}
+
+int dm_transient_snapshot_init(void)
+{
+	return 0;
+}
+
+void dm_transient_snapshot_exit(void)
+{
+}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 81f03a0..018b567 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1406,6 +1406,12 @@ static int __init dm_snapshot_init(void)
 {
 	int r;
 
+	r = dm_exception_store_init();
+	if (r) {
+		DMERR("Failed to initialize exception stores");
+		return r;
+	}
+
 	r = dm_register_target(&snapshot_target);
 	if (r) {
 		DMERR("snapshot target register failed %d", r);
@@ -1454,17 +1460,17 @@ static int __init dm_snapshot_init(void)
 
 	return 0;
 
-      bad_pending_pool:
+bad_pending_pool:
 	kmem_cache_destroy(tracked_chunk_cache);
-      bad5:
+bad5:
 	kmem_cache_destroy(pending_cache);
-      bad4:
+bad4:
 	kmem_cache_destroy(exception_cache);
-      bad3:
+bad3:
 	exit_origin_hash();
-      bad2:
+bad2:
 	dm_unregister_target(&origin_target);
-      bad1:
+bad1:
 	dm_unregister_target(&snapshot_target);
 	return r;
 }
@@ -1480,6 +1486,8 @@ static void __exit dm_snapshot_exit(void)
 	kmem_cache_destroy(pending_cache);
 	kmem_cache_destroy(exception_cache);
 	kmem_cache_destroy(tracked_chunk_cache);
+
+	dm_exception_store_exit();
 }
 
 /* Module hooks */
-- 
cgit v0.10.2


From a159c1ac5f33c6cf0f5aa3c9d1ccdc82c907ee46 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 6 Jan 2009 03:05:19 +0000
Subject: dm snapshot: extend exception store functions

Supply dm_add_exception as a callback to the read_metadata function.
Add a status function ready for a later patch and name the functions
consistently.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 74777e0..dccbfb0 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -11,7 +11,6 @@
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
-#include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "snapshot exception stores"
 
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 78d1ace..bb9f33d 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -11,6 +11,7 @@
 #define _LINUX_DM_EXCEPTION_STORE
 
 #include <linux/blkdev.h>
+#include <linux/device-mapper.h>
 
 /*
  * The snapshot code deals with largish chunks of the disk at a
@@ -37,7 +38,6 @@ struct dm_snap_exception {
  * COW device).
  */
 struct dm_exception_store {
-
 	/*
 	 * Destroys this object when you've finished with it.
 	 */
@@ -45,9 +45,13 @@ struct dm_exception_store {
 
 	/*
 	 * The target shouldn't read the COW device until this is
-	 * called.
+	 * called.  As exceptions are read from the COW, they are
+	 * reported back via the callback.
 	 */
-	int (*read_metadata) (struct dm_exception_store *store);
+	int (*read_metadata) (struct dm_exception_store *store,
+			      int (*callback)(void *callback_context,
+					      chunk_t old, chunk_t new),
+			      void *callback_context);
 
 	/*
 	 * Find somewhere to store the next exception.
@@ -68,6 +72,9 @@ struct dm_exception_store {
 	 */
 	void (*drop_snapshot) (struct dm_exception_store *store);
 
+	int (*status) (struct dm_exception_store *store, status_type_t status,
+		       char *result, unsigned int maxlen);
+
 	/*
 	 * Return how full the snapshot is.
 	 */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 57c946c..936b34e 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -395,7 +395,11 @@ static void write_exception(struct pstore *ps,
  * 'full' is filled in to indicate if the area has been
  * filled.
  */
-static int insert_exceptions(struct pstore *ps, int *full)
+static int insert_exceptions(struct pstore *ps,
+			     int (*callback)(void *callback_context,
+					     chunk_t old, chunk_t new),
+			     void *callback_context,
+			     int *full)
 {
 	int r;
 	unsigned int i;
@@ -428,7 +432,7 @@ static int insert_exceptions(struct pstore *ps, int *full)
 		/*
 		 * Otherwise we add the exception to the snapshot.
 		 */
-		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
+		r = callback(callback_context, de.old_chunk, de.new_chunk);
 		if (r)
 			return r;
 	}
@@ -436,7 +440,10 @@ static int insert_exceptions(struct pstore *ps, int *full)
 	return 0;
 }
 
-static int read_exceptions(struct pstore *ps)
+static int read_exceptions(struct pstore *ps,
+			   int (*callback)(void *callback_context, chunk_t old,
+					   chunk_t new),
+			   void *callback_context)
 {
 	int r, full = 1;
 
@@ -449,7 +456,7 @@ static int read_exceptions(struct pstore *ps)
 		if (r)
 			return r;
 
-		r = insert_exceptions(ps, &full);
+		r = insert_exceptions(ps, callback, callback_context, &full);
 		if (r)
 			return r;
 	}
@@ -482,7 +489,10 @@ static void persistent_destroy(struct dm_exception_store *store)
 	kfree(ps);
 }
 
-static int persistent_read_metadata(struct dm_exception_store *store)
+static int persistent_read_metadata(struct dm_exception_store *store,
+				    int (*callback)(void *callback_context,
+						    chunk_t old, chunk_t new),
+				    void *callback_context)
 {
 	int r, uninitialized_var(new_snapshot);
 	struct pstore *ps = get_info(store);
@@ -540,7 +550,7 @@ static int persistent_read_metadata(struct dm_exception_store *store)
 		/*
 		 * Read the metadata.
 		 */
-		r = read_exceptions(ps);
+		r = read_exceptions(ps, callback, callback_context);
 		if (r)
 			return r;
 	}
@@ -548,8 +558,8 @@ static int persistent_read_metadata(struct dm_exception_store *store)
 	return 0;
 }
 
-static int persistent_prepare(struct dm_exception_store *store,
-			      struct dm_snap_exception *e)
+static int persistent_prepare_exception(struct dm_exception_store *store,
+					struct dm_snap_exception *e)
 {
 	struct pstore *ps = get_info(store);
 	uint32_t stride;
@@ -575,10 +585,10 @@ static int persistent_prepare(struct dm_exception_store *store,
 	return 0;
 }
 
-static void persistent_commit(struct dm_exception_store *store,
-			      struct dm_snap_exception *e,
-			      void (*callback) (void *, int success),
-			      void *callback_context)
+static void persistent_commit_exception(struct dm_exception_store *store,
+					struct dm_snap_exception *e,
+					void (*callback) (void *, int success),
+					void *callback_context)
 {
 	unsigned int i;
 	struct pstore *ps = get_info(store);
@@ -637,7 +647,7 @@ static void persistent_commit(struct dm_exception_store *store,
 	ps->callback_count = 0;
 }
 
-static void persistent_drop(struct dm_exception_store *store)
+static void persistent_drop_snapshot(struct dm_exception_store *store)
 {
 	struct pstore *ps = get_info(store);
 
@@ -675,9 +685,9 @@ int dm_create_persistent(struct dm_exception_store *store)
 
 	store->destroy = persistent_destroy;
 	store->read_metadata = persistent_read_metadata;
-	store->prepare_exception = persistent_prepare;
-	store->commit_exception = persistent_commit;
-	store->drop_snapshot = persistent_drop;
+	store->prepare_exception = persistent_prepare_exception;
+	store->commit_exception = persistent_commit_exception;
+	store->drop_snapshot = persistent_drop_snapshot;
 	store->fraction_full = persistent_fraction_full;
 	store->context = ps;
 
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 2a781df..7f6e2e6 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -28,13 +28,16 @@ static void transient_destroy(struct dm_exception_store *store)
 	kfree(store->context);
 }
 
-static int transient_read_metadata(struct dm_exception_store *store)
+static int transient_read_metadata(struct dm_exception_store *store,
+				   int (*callback)(void *callback_context,
+						   chunk_t old, chunk_t new),
+				   void *callback_context)
 {
 	return 0;
 }
 
-static int transient_prepare(struct dm_exception_store *store,
-			     struct dm_snap_exception *e)
+static int transient_prepare_exception(struct dm_exception_store *store,
+				       struct dm_snap_exception *e)
 {
 	struct transient_c *tc = (struct transient_c *) store->context;
 	sector_t size = get_dev_size(store->snap->cow->bdev);
@@ -48,10 +51,10 @@ static int transient_prepare(struct dm_exception_store *store,
 	return 0;
 }
 
-static void transient_commit(struct dm_exception_store *store,
-			     struct dm_snap_exception *e,
-			     void (*callback) (void *, int success),
-			     void *callback_context)
+static void transient_commit_exception(struct dm_exception_store *store,
+				       struct dm_snap_exception *e,
+				       void (*callback) (void *, int success),
+				       void *callback_context)
 {
 	/* Just succeed */
 	callback(callback_context, 1);
@@ -70,8 +73,8 @@ int dm_create_transient(struct dm_exception_store *store)
 
 	store->destroy = transient_destroy;
 	store->read_metadata = transient_read_metadata;
-	store->prepare_exception = transient_prepare;
-	store->commit_exception = transient_commit;
+	store->prepare_exception = transient_prepare_exception;
+	store->commit_exception = transient_commit_exception;
 	store->drop_snapshot = NULL;
 	store->fraction_full = transient_fraction_full;
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 018b567..65ff82f 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -430,8 +430,13 @@ out:
 	list_add(&new_e->hash_list, e ? &e->hash_list : l);
 }
 
-int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
+/*
+ * Callback used by the exception stores to load exceptions when
+ * initialising.
+ */
+static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 {
+	struct dm_snapshot *s = context;
 	struct dm_snap_exception *e;
 
 	e = alloc_exception();
@@ -660,7 +665,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	spin_lock_init(&s->tracked_chunk_lock);
 
 	/* Metadata must only be loaded into one table at once */
-	r = s->store.read_metadata(&s->store);
+	r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
 	if (r < 0) {
 		ti->error = "Failed to read snapshot metadata";
 		goto bad_load_and_register;
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 9281236..d9e62b4 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -76,12 +76,6 @@ struct dm_snapshot {
 };
 
 /*
- * Used by the exception stores to load exceptions hen
- * initialising.
- */
-int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
-
-/*
  * Return the number of sectors in the device.
  */
 static inline sector_t get_dev_size(struct block_device *bdev)
-- 
cgit v0.10.2