From cfae7529b525c3fa86deb71cf2036659240a865e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 Apr 2016 12:05:38 -0400 Subject: dm: remove unused mapped_device argument from free_tio() Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3d3ac13..1b2f962 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -674,7 +674,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io) mempool_free(io, md->io_pool); } -static void free_tio(struct mapped_device *md, struct dm_target_io *tio) +static void free_tio(struct dm_target_io *tio) { bio_put(&tio->clone); } @@ -1055,7 +1055,7 @@ static void clone_endio(struct bio *bio) !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) disable_write_same(md); - free_tio(md, tio); + free_tio(tio); dec_pending(io, error); } @@ -1517,7 +1517,6 @@ static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; - struct mapped_device *md; struct bio *clone = &tio->clone; struct dm_target *ti = tio->ti; @@ -1540,9 +1539,8 @@ static void __map_bio(struct dm_target_io *tio) generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { /* error the io and bail out, or requeue it if needed */ - md = tio->io->md; dec_pending(tio->io, r); - free_tio(md, tio); + free_tio(tio); } else if (r != DM_MAPIO_SUBMITTED) { DMWARN("unimplemented target map return value: %d", r); BUG(); @@ -1663,7 +1661,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, tio->len_ptr = len; r = clone_bio(tio, bio, sector, *len); if (r < 0) { - free_tio(ci->md, tio); + free_tio(tio); break; } __map_bio(tio); -- cgit v0.10.2 From 813923b1a21b73f3068d89b817a86bea5371108c Mon Sep 17 00:00:00 2001 From: Amitoj Kaur Chawla Date: Mon, 11 Apr 2016 11:37:08 +0530 Subject: dm thin: Remove return statement from void function Return statement at the end of a void function is useless. The Coccinelle semantic patch used to make this change is as follows: // @@ identifier f; expression e; @@ void f(...) { <... - return e; ...> } // Signed-off-by: Amitoj Kaur Chawla Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 92237b6..04e7f3b 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -632,7 +632,7 @@ static void error_retry_list(struct pool *pool) { int error = get_pool_io_error_code(pool); - return error_retry_list_with_code(pool, error); + error_retry_list_with_code(pool, error); } /* -- cgit v0.10.2 From 518257b13276d07a19e6ae0608b8e5ee73383ce4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 17 Mar 2016 16:32:10 -0400 Subject: dm mpath: switch to using bitops for state flags Mechanical change that doesn't make any real effort to reduce the use of m->lock; that will come later (once atomics are used for counters, etc). Suggested-by: Hannes Reinecke Reviewed-by: Hannes Reinecke Tested-by: Hannes Reinecke Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 677ba22..598d4a1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -83,13 +83,7 @@ struct multipath { struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - bool queue_io:1; /* Must we queue all I/O? */ - bool queue_if_no_path:1; /* Queue I/O if last path fails? */ - bool saved_queue_if_no_path:1; /* Saved state during suspension */ - bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. 
*/ - bool pg_init_disabled:1; /* pg_init is not currently allowed */ - bool pg_init_required:1; /* pg_init needs calling? */ - bool pg_init_delay_retry:1; /* Delay pg_init retry? */ + unsigned long flags; /* Multipath state flags */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -122,6 +116,17 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); +/*----------------------------------------------- + * Multipath state flags. + *-----------------------------------------------*/ + +#define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */ +#define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? */ +#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ +#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ +#define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ +#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ +#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ /*----------------------------------------------- * Allocation routines @@ -189,7 +194,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - m->queue_io = true; + set_bit(MPATHF_QUEUE_IO, &m->flags); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -274,17 +279,17 @@ static int __pg_init_all_paths(struct multipath *m) struct pgpath *pgpath; unsigned long pg_init_delay = 0; - if (m->pg_init_in_progress || m->pg_init_disabled) + if (m->pg_init_in_progress || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) return 0; m->pg_init_count++; - m->pg_init_required = false; + clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); /* Check here to reset pg_init_required */ if (!m->current_pg) return 0; - if (m->pg_init_delay_retry) + if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags)) pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { @@ -304,11 +309,11 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) /* Must we initialise the PG first, and queue I/O till it's ready? 
*/ if (m->hw_handler_name) { - m->pg_init_required = true; - m->queue_io = true; + set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + set_bit(MPATHF_QUEUE_IO, &m->flags); } else { - m->pg_init_required = false; - m->queue_io = false; + clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + clear_bit(MPATHF_QUEUE_IO, &m->flags); } m->pg_init_count = 0; @@ -337,7 +342,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) bool bypassed = true; if (!m->nr_valid_paths) { - m->queue_io = false; + clear_bit(MPATHF_QUEUE_IO, &m->flags); goto failed; } @@ -365,7 +370,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) continue; if (!__choose_path_in_pg(m, pg, nr_bytes)) { if (!bypassed) - m->pg_init_delay_retry = true; + set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); return; } } @@ -389,8 +394,9 @@ failed: */ static int __must_push_back(struct multipath *m) { - return (m->queue_if_no_path || - (m->queue_if_no_path != m->saved_queue_if_no_path && + return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || + ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && dm_noflush_suspending(m->ti))); } @@ -411,7 +417,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, spin_lock_irq(&m->lock); /* Do we need to select a new pgpath? */ - if (!m->current_pgpath || !m->queue_io) + if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; @@ -420,7 +426,8 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, if (!__must_push_back(m)) r = -EIO; /* Failed */ goto out_unlock; - } else if (m->queue_io || m->pg_init_required) { + } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || + test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { __pg_init_all_paths(m); goto out_unlock; } @@ -503,11 +510,22 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, spin_lock_irqsave(&m->lock, flags); - if (save_old_value) - m->saved_queue_if_no_path = m->queue_if_no_path; + if (save_old_value) { + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + else + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } else { + if (queue_if_no_path) + set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + else + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } + if (queue_if_no_path) + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); else - m->saved_queue_if_no_path = queue_if_no_path; - m->queue_if_no_path = queue_if_no_path; + clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) @@ -600,10 +618,10 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps goto bad; } - if (m->retain_attached_hw_handler || m->hw_handler_name) + if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name) q = bdev_get_queue(p->path.dev->bdev); - if (m->retain_attached_hw_handler) { + if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { retain: attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); if (attached_handler_name) { @@ -808,7 +826,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) } if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { - m->retain_attached_hw_handler = true; + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); continue; } @@ -944,20 +962,16 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static 
void flush_multipath_work(struct multipath *m) { - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = true; - spin_unlock_irqrestore(&m->lock, flags); + set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_work(&m->trigger_event); - spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = false; - spin_unlock_irqrestore(&m->lock, flags); + clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); } static void multipath_dtr(struct dm_target *ti) @@ -1152,8 +1166,8 @@ static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) - m->pg_init_required = true; + if (m->pg_init_count <= m->pg_init_retries && !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) + set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); else limit_reached = true; @@ -1219,19 +1233,23 @@ static void pg_init_done(void *data, int errors) m->current_pgpath = NULL; m->current_pg = NULL; } - } else if (!m->pg_init_required) + } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg->bypassed = false; if (--m->pg_init_in_progress) /* Activations of other paths are still on going */ goto out; - if (m->pg_init_required) { - m->pg_init_delay_retry = delay_retry; + if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + if (delay_retry) + set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + else + clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + if (__pg_init_all_paths(m)) goto out; } - m->queue_io = false; + clear_bit(MPATHF_QUEUE_IO, &m->flags); /* * Wake up any thread waiting to suspend. @@ -1300,7 +1318,7 @@ static int do_end_io(struct multipath *m, struct request *clone, spin_lock_irqsave(&m->lock, flags); if (!m->nr_valid_paths) { - if (!m->queue_if_no_path) { + if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { if (!__must_push_back(m)) r = -EIO; } else { @@ -1364,11 +1382,12 @@ static void multipath_postsuspend(struct dm_target *ti) static void multipath_resume(struct dm_target *ti) { struct multipath *m = ti->private; - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); - m->queue_if_no_path = m->saved_queue_if_no_path; - spin_unlock_irqrestore(&m->lock, flags); + if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + else + clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + smp_mb__after_atomic(); } /* @@ -1402,19 +1421,19 @@ static void multipath_status(struct dm_target *ti, status_type_t type, /* Features */ if (type == STATUSTYPE_INFO) - DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count); + DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), m->pg_init_count); else { - DMEMIT("%u ", m->queue_if_no_path + + DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + (m->pg_init_retries > 0) * 2 + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + - m->retain_attached_hw_handler); - if (m->queue_if_no_path) + test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); - if (m->retain_attached_hw_handler) + if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) 
DMEMIT("retain_attached_hw_handler "); } @@ -1572,7 +1591,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, __choose_pgpath(m, 0); if (m->current_pgpath) { - if (!m->queue_io) { + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { *bdev = m->current_pgpath->path.dev->bdev; *mode = m->current_pgpath->path.dev->mode; r = 0; @@ -1582,7 +1601,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } } else { /* No path is available */ - if (m->queue_if_no_path) + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; else r = -EIO; @@ -1596,7 +1615,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ __choose_pgpath(m, 0); } - if (m->pg_init_required) + if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); dm_table_run_md_queue_async(m->ti->table); @@ -1657,7 +1676,7 @@ static int multipath_busy(struct dm_target *ti) /* pg_init in progress or no paths available */ if (m->pg_init_in_progress || - (!m->nr_valid_paths && m->queue_if_no_path)) { + (!m->nr_valid_paths && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { busy = true; goto out; } -- cgit v0.10.2 From 91e968aa6015d7366281b532dad2e48855b91fe3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 17 Mar 2016 17:10:15 -0400 Subject: dm mpath: use atomic_t for counting members of 'struct multipath' The use of atomic_t for nr_valid_paths, pg_init_in_progress and pg_init_count will allow relaxing the use of the m->lock spinlock. Suggested-by: Hannes Reinecke Reviewed-by: Hannes Reinecke Tested-by: Hannes Reinecke Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 598d4a1..780e5d0 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -76,9 +76,6 @@ struct multipath { wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ - unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - - unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ @@ -86,9 +83,12 @@ struct multipath { unsigned long flags; /* Multipath state flags */ unsigned pg_init_retries; /* Number of times to retry pg_init */ - unsigned pg_init_count; /* Number of times pg_init called */ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ + atomic_t nr_valid_paths; /* Total number of usable paths */ + atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ + atomic_t pg_init_count; /* Number of times pg_init called */ + struct work_struct trigger_event; /* @@ -195,6 +195,9 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); set_bit(MPATHF_QUEUE_IO, &m->flags); + atomic_set(&m->nr_valid_paths, 0); + atomic_set(&m->pg_init_in_progress, 0); + atomic_set(&m->pg_init_count, 0); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -279,10 +282,10 @@ static int __pg_init_all_paths(struct multipath *m) struct pgpath *pgpath; unsigned long pg_init_delay = 0; - if (m->pg_init_in_progress || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) + if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) return 0; - m->pg_init_count++; + atomic_inc(&m->pg_init_count); clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); /* Check 
here to reset pg_init_required */ @@ -298,9 +301,9 @@ static int __pg_init_all_paths(struct multipath *m) continue; if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, pg_init_delay)) - m->pg_init_in_progress++; + atomic_inc(&m->pg_init_in_progress); } - return m->pg_init_in_progress; + return atomic_read(&m->pg_init_in_progress); } static void __switch_pg(struct multipath *m, struct pgpath *pgpath) @@ -316,7 +319,7 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) clear_bit(MPATHF_QUEUE_IO, &m->flags); } - m->pg_init_count = 0; + atomic_set(&m->pg_init_count, 0); } static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, @@ -341,7 +344,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) struct priority_group *pg; bool bypassed = true; - if (!m->nr_valid_paths) { + if (!atomic_read(&m->nr_valid_paths)) { clear_bit(MPATHF_QUEUE_IO, &m->flags); goto failed; } @@ -902,6 +905,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, /* parse the priority groups */ while (as.argc) { struct priority_group *pg; + unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths); pg = parse_priority_group(&as, m); if (IS_ERR(pg)) { @@ -909,7 +913,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } - m->nr_valid_paths += pg->nr_pgpaths; + nr_valid_paths += pg->nr_pgpaths; + atomic_set(&m->nr_valid_paths, nr_valid_paths); + list_add_tail(&pg->list, &m->priority_groups); pg_count++; pg->pg_num = pg_count; @@ -939,19 +945,14 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, static void multipath_wait_for_pg_init_completion(struct multipath *m) { DECLARE_WAITQUEUE(wait, current); - unsigned long flags; add_wait_queue(&m->pg_init_wait, &wait); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irqsave(&m->lock, flags); - if (!m->pg_init_in_progress) { - spin_unlock_irqrestore(&m->lock, flags); + if (!atomic_read(&m->pg_init_in_progress)) break; - } - spin_unlock_irqrestore(&m->lock, flags); io_schedule(); } @@ -1001,13 +1002,13 @@ static int fail_path(struct pgpath *pgpath) pgpath->is_active = false; pgpath->fail_count++; - m->nr_valid_paths--; + atomic_dec(&m->nr_valid_paths); if (pgpath == m->current_pgpath) m->current_pgpath = NULL; dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, - pgpath->path.dev->name, m->nr_valid_paths); + pgpath->path.dev->name, atomic_read(&m->nr_valid_paths)); schedule_work(&m->trigger_event); @@ -1025,6 +1026,7 @@ static int reinstate_path(struct pgpath *pgpath) int r = 0, run_queue = 0; unsigned long flags; struct multipath *m = pgpath->pg->m; + unsigned nr_valid_paths; spin_lock_irqsave(&m->lock, flags); @@ -1039,16 +1041,17 @@ static int reinstate_path(struct pgpath *pgpath) pgpath->is_active = true; - if (!m->nr_valid_paths++) { + nr_valid_paths = atomic_inc_return(&m->nr_valid_paths); + if (nr_valid_paths == 1) { m->current_pgpath = NULL; run_queue = 1; } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) - m->pg_init_in_progress++; + atomic_inc(&m->pg_init_in_progress); } dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, - pgpath->path.dev->name, m->nr_valid_paths); + pgpath->path.dev->name, nr_valid_paths); schedule_work(&m->trigger_event); @@ -1166,7 +1169,8 @@ static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (m->pg_init_count <= m->pg_init_retries && !test_bit(MPATHF_PG_INIT_DISABLED, 
&m->flags)) + if (atomic_read(&m->pg_init_count) <= m->pg_init_retries && + !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); else limit_reached = true; @@ -1236,7 +1240,7 @@ static void pg_init_done(void *data, int errors) } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg->bypassed = false; - if (--m->pg_init_in_progress) + if (atomic_dec_return(&m->pg_init_in_progress) > 0) /* Activations of other paths are still on going */ goto out; @@ -1317,7 +1321,7 @@ static int do_end_io(struct multipath *m, struct request *clone, fail_path(mpio->pgpath); spin_lock_irqsave(&m->lock, flags); - if (!m->nr_valid_paths) { + if (!atomic_read(&m->nr_valid_paths)) { if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { if (!__must_push_back(m)) r = -EIO; @@ -1421,7 +1425,8 @@ static void multipath_status(struct dm_target *ti, status_type_t type, /* Features */ if (type == STATUSTYPE_INFO) - DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), m->pg_init_count); + DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), + atomic_read(&m->pg_init_count)); else { DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + (m->pg_init_retries > 0) * 2 + @@ -1675,8 +1680,8 @@ static int multipath_busy(struct dm_target *ti) spin_lock_irqsave(&m->lock, flags); /* pg_init in progress or no paths available */ - if (m->pg_init_in_progress || - (!m->nr_valid_paths && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { + if (atomic_read(&m->pg_init_in_progress) || + (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { busy = true; goto out; } -- cgit v0.10.2 From 20800cb3450ee44ec1827d7e8bbfd5a9dc02e6cd Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 17 Mar 2016 17:13:10 -0400 Subject: dm mpath: move trigger_event member to the end of 'struct multipath' Allows the 'work_mutex' member to no longer cross a cacheline. Reviewed-by: Hannes Reinecke Tested-by: Hannes Reinecke Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 780e5d0..54daf96 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -89,8 +89,6 @@ struct multipath { atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* Number of times pg_init called */ - struct work_struct trigger_event; - /* * We must use a mempool of dm_mpath_io structs so that we * can resubmit bios on error. @@ -98,6 +96,7 @@ struct multipath { mempool_t *mpio_pool; struct mutex work_mutex; + struct work_struct trigger_event; }; /* -- cgit v0.10.2 From 2da1610ae20e995e53658c3b10166d2ad74e30bd Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 17 Mar 2016 18:38:17 -0400 Subject: dm mpath: eliminate use of spinlock in IO fast-paths The primary motivation of this commit is to improve the scalability of DM multipath on large NUMA systems where m->lock spinlock contention has been proven to be a serious bottleneck on really fast storage. The ability to atomically read a pointer, using lockless_dereference(), is leveraged in this commit. But all pointer writes are still protected by the m->lock spinlock (which is fine since these all now occur in the slow-path). The following functions no longer require the m->lock spinlock in their fast-path: multipath_busy(), __multipath_map(), and do_end_io() And choose_pgpath() is modified to _not_ update m->current_pgpath unless it also switches the path-group. This is done to avoid needing to take the m->lock everytime __multipath_map() calls choose_pgpath(). 
But m->current_pgpath will be reset if it is failed via fail_path(). Suggested-by: Jeff Moyer Reviewed-by: Hannes Reinecke Tested-by: Hannes Reinecke Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 54daf96..52baf8a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -305,9 +305,21 @@ static int __pg_init_all_paths(struct multipath *m) return atomic_read(&m->pg_init_in_progress); } -static void __switch_pg(struct multipath *m, struct pgpath *pgpath) +static int pg_init_all_paths(struct multipath *m) { - m->current_pg = pgpath->pg; + int r; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + r = __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +static void __switch_pg(struct multipath *m, struct priority_group *pg) +{ + m->current_pg = pg; /* Must we initialise the PG first, and queue I/O till it's ready? */ if (m->hw_handler_name) { @@ -321,26 +333,36 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) atomic_set(&m->pg_init_count, 0); } -static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, - size_t nr_bytes) +static struct pgpath *choose_path_in_pg(struct multipath *m, + struct priority_group *pg, + size_t nr_bytes) { + unsigned long flags; struct dm_path *path; + struct pgpath *pgpath; path = pg->ps.type->select_path(&pg->ps, nr_bytes); if (!path) - return -ENXIO; + return ERR_PTR(-ENXIO); - m->current_pgpath = path_to_pgpath(path); + pgpath = path_to_pgpath(path); - if (m->current_pg != pg) - __switch_pg(m, m->current_pgpath); + if (unlikely(lockless_dereference(m->current_pg) != pg)) { + /* Only update current_pgpath if pg changed */ + spin_lock_irqsave(&m->lock, flags); + m->current_pgpath = pgpath; + __switch_pg(m, pg); + spin_unlock_irqrestore(&m->lock, flags); + } - return 0; + return pgpath; } -static void __choose_pgpath(struct multipath *m, size_t nr_bytes) +static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) { + unsigned long flags; struct priority_group *pg; + struct pgpath *pgpath; bool bypassed = true; if (!atomic_read(&m->nr_valid_paths)) { @@ -349,16 +371,28 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (m->next_pg) { + if (lockless_dereference(m->next_pg)) { + spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; + if (!pg) { + spin_unlock_irqrestore(&m->lock, flags); + goto check_current_pg; + } m->next_pg = NULL; - if (!__choose_path_in_pg(m, pg, nr_bytes)) - return; + spin_unlock_irqrestore(&m->lock, flags); + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; } /* Don't change PG until it has no remaining paths */ - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) - return; +check_current_pg: + pg = lockless_dereference(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } /* * Loop through priority groups until we find a valid path. 
@@ -370,31 +404,34 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg, nr_bytes)) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) { if (!bypassed) set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); - return; + return pgpath; } } } while (bypassed--); failed: + spin_lock_irqsave(&m->lock, flags); m->current_pgpath = NULL; m->current_pg = NULL; + spin_unlock_irqrestore(&m->lock, flags); + + return NULL; } /* * Check whether bios must be queued in the device-mapper core rather * than here in the target. * - * m->lock must be held on entry. - * * If m->queue_if_no_path and m->saved_queue_if_no_path hold the * same value then we are not between multipath_presuspend() * and multipath_resume() calls and we have no need to check * for the DMF_NOFLUSH_SUSPENDING flag. */ -static int __must_push_back(struct multipath *m) +static int must_push_back(struct multipath *m) { return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != @@ -416,36 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, struct block_device *bdev; struct dm_mpath_io *mpio; - spin_lock_irq(&m->lock); - /* Do we need to select a new pgpath? */ - if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) - __choose_pgpath(m, nr_bytes); - - pgpath = m->current_pgpath; + pgpath = lockless_dereference(m->current_pgpath); + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { - if (!__must_push_back(m)) + if (!must_push_back(m)) r = -EIO; /* Failed */ - goto out_unlock; + return r; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { - __pg_init_all_paths(m); - goto out_unlock; + pg_init_all_paths(m); + return r; } mpio = set_mpio(m, map_context); if (!mpio) /* ENOMEM, requeue */ - goto out_unlock; + return r; mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; bdev = pgpath->path.dev->bdev; - spin_unlock_irq(&m->lock); - if (clone) { /* * Old request-based interface: allocated clone is passed in. @@ -477,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, &pgpath->path, nr_bytes); return DM_MAPIO_REMAPPED; - -out_unlock: - spin_unlock_irq(&m->lock); - - return r; } static int multipath_map(struct dm_target *ti, struct request *clone, @@ -1308,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone, * clone bios for it and resubmit it later. 
*/ int r = DM_ENDIO_REQUEUE; - unsigned long flags; if (!error && !clone->errors) return 0; /* I/O complete */ @@ -1319,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone, if (mpio->pgpath) fail_path(mpio->pgpath); - spin_lock_irqsave(&m->lock, flags); if (!atomic_read(&m->nr_valid_paths)) { if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (!__must_push_back(m)) + if (!must_push_back(m)) r = -EIO; } else { if (error == -EBADE) r = error; } } - spin_unlock_irqrestore(&m->lock, flags); return r; } @@ -1586,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct multipath *m = ti->private; - unsigned long flags; + struct pgpath *current_pgpath; int r; - spin_lock_irqsave(&m->lock, flags); - - if (!m->current_pgpath) - __choose_pgpath(m, 0); + current_pgpath = lockless_dereference(m->current_pgpath); + if (!current_pgpath) + current_pgpath = choose_pgpath(m, 0); - if (m->current_pgpath) { + if (current_pgpath) { if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { - *bdev = m->current_pgpath->path.dev->bdev; - *mode = m->current_pgpath->path.dev->mode; + *bdev = current_pgpath->path.dev->bdev; + *mode = current_pgpath->path.dev->mode; r = 0; } else { /* pg_init has not started or completed */ @@ -1611,17 +1634,13 @@ static int multipath_prepare_ioctl(struct dm_target *ti, r = -EIO; } - spin_unlock_irqrestore(&m->lock, flags); - if (r == -ENOTCONN) { - spin_lock_irqsave(&m->lock, flags); - if (!m->current_pg) { + if (!lockless_dereference(m->current_pg)) { /* Path status changed, redo selection */ - __choose_pgpath(m, 0); + (void) choose_pgpath(m, 0); } if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - __pg_init_all_paths(m); - spin_unlock_irqrestore(&m->lock, flags); + pg_init_all_paths(m); dm_table_run_md_queue_async(m->ti->table); } @@ -1672,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti) { bool busy = false, has_active = false; struct multipath *m = ti->private; - struct priority_group *pg; + struct priority_group *pg, *next_pg; struct pgpath *pgpath; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); /* pg_init in progress or no paths available */ if (atomic_read(&m->pg_init_in_progress) || - (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { - busy = true; - goto out; - } + (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) + return true; + /* Guess which priority_group will be used at next mapping time */ - if (unlikely(!m->current_pgpath && m->next_pg)) - pg = m->next_pg; - else if (likely(m->current_pg)) - pg = m->current_pg; - else + pg = lockless_dereference(m->current_pg); + next_pg = lockless_dereference(m->next_pg); + if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = next_pg; + + if (!pg) { /* * We don't know which pg will be used at next mapping time. - * We don't call __choose_pgpath() here to avoid to trigger + * We don't call choose_pgpath() here to avoid to trigger * pg_init just by busy checking. * So we don't know whether underlying devices we will be using * at next mapping time are busy or not. Just try mapping. */ - goto out; + return busy; + } /* * If there is one non-busy active path at least, the path selector * will be able to select it. So we consider such a pg as not busy. 
*/ busy = true; - list_for_each_entry(pgpath, &pg->pgpaths, list) + list_for_each_entry(pgpath, &pg->pgpaths, list) { if (pgpath->is_active) { has_active = true; if (!pgpath_busy(pgpath)) { @@ -1712,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti) break; } } + } - if (!has_active) + if (!has_active) { /* * No active path in this pg, so this pg won't be used and * the current_pg will be changed at next mapping time. * We need to try mapping to determine it. */ busy = false; - -out: - spin_unlock_irqrestore(&m->lock, flags); + } return busy; } -- cgit v0.10.2 From 492d48db8db0f89fb8f2cf7d31d22221cd9194e7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Apr 2016 21:11:25 -0400 Subject: dm cache: update cache-policies.txt now that mq is an alias for smq Also fix some typos and make all "smq" and "mq" references consistently lowercase. Signed-off-by: Mike Snitzer diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt index e5062ad..d3ca8af 100644 --- a/Documentation/device-mapper/cache-policies.txt +++ b/Documentation/device-mapper/cache-policies.txt @@ -11,7 +11,7 @@ Every bio that is mapped by the target is referred to the policy. The policy can return a simple HIT or MISS or issue a migration. Currently there's no way for the policy to issue background work, -e.g. to start writing back dirty blocks that are going to be evicte +e.g. to start writing back dirty blocks that are going to be evicted soon. Because we map bios, rather than requests it's easy for the policy @@ -48,7 +48,7 @@ with the multiqueue (mq) policy. The smq policy (vs mq) offers the promise of less memory utilization, improved performance and increased adaptability in the face of changing -workloads. SMQ also does not have any cumbersome tuning knobs. +workloads. smq also does not have any cumbersome tuning knobs. Users may switch from "mq" to "smq" simply by appropriately reloading a DM table that is using the cache target. Doing so will cause all of the @@ -57,47 +57,45 @@ degrade slightly until smq recalculates the origin device's hotspots that should be cached. Memory usage: -The mq policy uses a lot of memory; 88 bytes per cache block on a 64 +The mq policy used a lot of memory; 88 bytes per cache block on a 64 bit machine. -SMQ uses 28bit indexes to implement it's data structures rather than +smq uses 28bit indexes to implement it's data structures rather than pointers. It avoids storing an explicit hit count for each block. It -has a 'hotspot' queue rather than a pre cache which uses a quarter of +has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of the entries (each hotspot block covers a larger area than a single cache block). -All these mean smq uses ~25bytes per cache block. Still a lot of +All this means smq uses ~25bytes per cache block. Still a lot of memory, but a substantial improvement nontheless. Level balancing: -MQ places entries in different levels of the multiqueue structures -based on their hit count (~ln(hit count)). This means the bottom -levels generally have the most entries, and the top ones have very -few. Having unbalanced levels like this reduces the efficacy of the +mq placed entries in different levels of the multiqueue structures +based on their hit count (~ln(hit count)). This meant the bottom +levels generally had the most entries, and the top ones had very +few. Having unbalanced levels like this reduced the efficacy of the multiqueue. 
-SMQ does not maintain a hit count, instead it swaps hit entries with -the least recently used entry from the level above. The over all +smq does not maintain a hit count, instead it swaps hit entries with +the least recently used entry from the level above. The overall ordering being a side effect of this stochastic process. With this scheme we can decide how many entries occupy each multiqueue level, resulting in better promotion/demotion decisions. Adaptability: -The MQ policy maintains a hit count for each cache block. For a +The mq policy maintained a hit count for each cache block. For a different block to get promoted to the cache it's hit count has to -exceed the lowest currently in the cache. This means it can take a +exceed the lowest currently in the cache. This meant it could take a long time for the cache to adapt between varying IO patterns. -Periodically degrading the hit counts could help with this, but I -haven't found a nice general solution. -SMQ doesn't maintain hit counts, so a lot of this problem just goes +smq doesn't maintain hit counts, so a lot of this problem just goes away. In addition it tracks performance of the hotspot queue, which is used to decide which blocks to promote. If the hotspot queue is performing badly then it starts moving entries more quickly between levels. This lets it adapt to new IO patterns very quickly. Performance: -Testing SMQ shows substantially better performance than MQ. +Testing smq shows substantially better performance than mq. cleaner ------- -- cgit v0.10.2 From 52813d4046851ceab56ad6d09291e4fce00d1a0c Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Mon, 25 Apr 2016 01:24:03 +0100 Subject: dm stats: fix spelling mistake in Documentation Signed-off-by: Eric Engestrom Signed-off-by: Mike Snitzer diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt index 6f5ef94..170ac02 100644 --- a/Documentation/device-mapper/statistics.txt +++ b/Documentation/device-mapper/statistics.txt @@ -205,7 +205,7 @@ statistics on them: dmsetup message vol 0 @stats_create - /100 -Set the auxillary data string to "foo bar baz" (the escape for each +Set the auxiliary data string to "foo bar baz" (the escape for each space must also be escaped, otherwise the shell will consume them): dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz -- cgit v0.10.2 From 72f6d8d8c9b3592da7109cad3415559864ee9f2d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 28 Apr 2016 15:24:03 +0200 Subject: dm ioctl: drop use of __GFP_REPEAT in copy_params()'s __vmalloc() call copy_params()'s use of __GFP_REPEAT for the __vmalloc() call doesn't make much sense because vmalloc doesn't rely on costly high order allocations. 
Signed-off-by: Michal Hocko Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2adf81d..2c7ca25 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1723,7 +1723,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern if (!dmi) { unsigned noio_flag; noio_flag = memalloc_noio_save(); - dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); + dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); memalloc_noio_restore(noio_flag); if (dmi) *param_flags |= DM_PARAMS_VMALLOC; -- cgit v0.10.2 From 4c9971ca6a172e70f52a7f9b6796e843c3f70293 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 29 Apr 2016 18:59:56 +0200 Subject: dm raid: make sure no feature flags are set in metadata Given we don't yet support any feature flags in the dm-raid ondisk metadata (see: 'features' member of 'struct dm_raid_superblock'), add a check to ensure no flags are actually set; if any feature flags are set, reject activation of the RAID mapping. This is to prevent possible data corruption in case of a kernel downgrade when there'll potentially be feature flags set by a future dm-raid target. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index a090121..5253274 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1037,6 +1037,11 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) if (!mddev->events && super_init_validation(mddev, rdev)) return -EINVAL; + if (le32_to_cpu(sb->features)) { + rs->ti->error = "Unable to assemble array: No feature flags supported yet"; + return -EINVAL; + } + /* Enable bitmap creation for RAID levels != 0 */ mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0; rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; @@ -1718,7 +1723,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v0.10.2 From 13e4f8a695aa1dc7c94525047fc2ffb9abc8125e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 4 May 2016 15:05:44 -0400 Subject: dm thin: remove __bio_inc_remaining() and switch to using bio_inc_remaining() DM thinp's use of bio_inc_remaining() is critical to ensure the original parent discard bio isn't completed before its sub-discards have completed. DM thinp needs this due to the extra quiescing that occurs, via multiple DM thinp mappings, while processing large discards. As such DM thinp must build the async discard bio chain after some delay -- so bio_inc_remaining() is used to enable DM thinp to take a reference on the original parent discard bio for each mapping. This allows the immediate use of bio_endio() on that discard bio; but with the understanding that the actual completion won't occur until each of the sub-discards' per-mapping references are dropped. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 04e7f3b..da42c49 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1494,17 +1494,6 @@ static void process_discard_cell_no_passdown(struct thin_c *tc, pool->process_prepared_discard(m); } -/* - * __bio_inc_remaining() is used to defer parent bios's end_io until - * we _know_ all chained sub range discard bios have completed.
- */ -static inline void __bio_inc_remaining(struct bio *bio) -{ - bio->bi_flags |= (1 << BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end, struct bio *bio) { @@ -1560,7 +1549,7 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t * the implicit decrement that occurs via bio_endio() in * process_prepared_discard_{passdown,no_passdown}. */ - __bio_inc_remaining(bio); + bio_inc_remaining(bio); if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) pool->process_prepared_discard(m); -- cgit v0.10.2 From 3dba53a958a758fe7bed5002f6a2846e1acefe8e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 2 May 2016 20:16:21 -0400 Subject: dm thin: use __blkdev_issue_discard for async discard support With commit 38f25255330 ("block: add __blkdev_issue_discard") DM thinp no longer needs to carry its own async discard method. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Reviewed-by: Christoph Hellwig diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index da42c49..598a78b 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -322,56 +322,6 @@ struct thin_c { /*----------------------------------------------------------------*/ -/** - * __blkdev_issue_discard_async - queue a discard with async completion - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour - * @parent_bio: parent discard bio that all sub discards get chained to - * - * Description: - * Asynchronously issue a discard request for the sectors in question. - */ -static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags, - struct bio *parent_bio) -{ - struct request_queue *q = bdev_get_queue(bdev); - int type = REQ_WRITE | REQ_DISCARD; - struct bio *bio; - - if (!q || !nr_sects) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - if (flags & BLKDEV_DISCARD_SECURE) { - if (!blk_queue_secdiscard(q)) - return -EOPNOTSUPP; - type |= REQ_SECURE; - } - - /* - * Required bio_put occurs in bio_endio thanks to bio_chain below - */ - bio = bio_alloc(gfp_mask, 1); - if (!bio) - return -ENOMEM; - - bio_chain(bio, parent_bio); - - bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_iter.bi_size = nr_sects << 9; - - submit_bio(type, bio); - - return 0; -} - static bool block_size_is_power_of_two(struct pool *pool) { return pool->sectors_per_block_shift >= 0; @@ -387,11 +337,23 @@ static sector_t block_to_sectors(struct pool *pool, dm_block_t b) static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e, struct bio *parent_bio) { + int type = REQ_WRITE | REQ_DISCARD; sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = __blkdev_issue_discard(tc->pool_dev->bdev, s, len, + GFP_NOWAIT, type, &bio); + if (!ret && bio) { + bio_chain(bio, parent_bio); + submit_bio(type, bio); + } + blk_finish_plug(&plug); - return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len, - GFP_NOWAIT, 0, parent_bio); + return ret; } /*----------------------------------------------------------------*/ @@ -1543,11 +1505,11 @@ static void 
break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t /* * The parent bio must not complete before sub discard bios are - * chained to it (see __blkdev_issue_discard_async's bio_chain)! + * chained to it (see issue_discard's bio_chain)! * * This per-mapping bi_remaining increment is paired with * the implicit decrement that occurs via bio_endio() in - * process_prepared_discard_{passdown,no_passdown}. + * process_prepared_discard_passdown(). */ bio_inc_remaining(bio); if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) -- cgit v0.10.2 From 202bae52934d4eb79ffaebf49f49b1cc64d8e40b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 May 2016 14:12:42 -0400 Subject: dm thin: unroll issue_discard() to create longer discard bio chains There is little benefit to doing this but it does structure DM thinp's code to more cleanly use the __blkdev_issue_discard() interface -- particularly in passdown_double_checking_shared_status(). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 598a78b..fc803d5 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -334,26 +334,55 @@ static sector_t block_to_sectors(struct pool *pool, dm_block_t b) (b * pool->sectors_per_block); } -static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e, - struct bio *parent_bio) +/*----------------------------------------------------------------*/ + +struct discard_op { + struct thin_c *tc; + struct blk_plug plug; + struct bio *parent_bio; + struct bio *bio; +}; + +static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent) +{ + BUG_ON(!parent); + + op->tc = tc; + blk_start_plug(&op->plug); + op->parent_bio = parent; + op->bio = NULL; +} + +static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e) { - int type = REQ_WRITE | REQ_DISCARD; + struct thin_c *tc = op->tc; sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); - struct bio *bio = NULL; - struct blk_plug plug; - int ret; - blk_start_plug(&plug); - ret = __blkdev_issue_discard(tc->pool_dev->bdev, s, len, - GFP_NOWAIT, type, &bio); - if (!ret && bio) { - bio_chain(bio, parent_bio); - submit_bio(type, bio); + return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, + GFP_NOWAIT, REQ_WRITE | REQ_DISCARD, &op->bio); +} + +static void end_discard(struct discard_op *op, int r) +{ + if (op->bio) { + /* + * Even if one of the calls to issue_discard failed, we + * need to wait for the chain to complete. + */ + bio_chain(op->bio, op->parent_bio); + submit_bio(REQ_WRITE | REQ_DISCARD, op->bio); } - blk_finish_plug(&plug); - return ret; + blk_finish_plug(&op->plug); + + /* + * Even if r is set, there could be sub discards in flight that we + * need to wait for. + */ + if (r && !op->parent_bio->bi_error) + op->parent_bio->bi_error = r; + bio_endio(op->parent_bio); } /*----------------------------------------------------------------*/ @@ -968,24 +997,28 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m) mempool_free(m, tc->pool->mapping_pool); } -static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) +/*----------------------------------------------------------------*/ + +static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) { /* * We've already unmapped this range of blocks, but before we * passdown we have to check that these blocks are now unused. 
*/ - int r; + int r = 0; bool used = true; struct thin_c *tc = m->tc; struct pool *pool = tc->pool; dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; + struct discard_op op; + begin_discard(&op, tc, m->bio); while (b != end) { /* find start of unmapped run */ for (; b < end; b++) { r = dm_pool_block_is_used(pool->pmd, b, &used); if (r) - return r; + goto out; if (!used) break; @@ -998,20 +1031,20 @@ static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) for (e = b + 1; e != end; e++) { r = dm_pool_block_is_used(pool->pmd, e, &used); if (r) - return r; + goto out; if (used) break; } - r = issue_discard(tc, b, e, m->bio); + r = issue_discard(&op, b, e); if (r) - return r; + goto out; b = e; } - - return 0; +out: + end_discard(&op, r); } static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) @@ -1021,20 +1054,21 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) struct pool *pool = tc->pool; r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end); - if (r) + if (r) { metadata_operation_failed(pool, "dm_thin_remove_range", r); + bio_io_error(m->bio); - else if (m->maybe_shared) - r = passdown_double_checking_shared_status(m); - else - r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio); + } else if (m->maybe_shared) { + passdown_double_checking_shared_status(m); + + } else { + struct discard_op op; + begin_discard(&op, tc, m->bio); + r = issue_discard(&op, m->data_block, + m->data_block + (m->virt_end - m->virt_begin)); + end_discard(&op, r); + } - /* - * Even if r is set, there could be sub discards in flight that we - * need to wait for. - */ - m->bio->bi_error = r; - bio_endio(m->bio); cell_defer_no_holder(tc, m->cell); mempool_free(m, pool->mapping_pool); } @@ -1505,11 +1539,11 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t /* * The parent bio must not complete before sub discard bios are - * chained to it (see issue_discard's bio_chain)! + * chained to it (see end_discard's bio_chain)! * * This per-mapping bi_remaining increment is paired with * the implicit decrement that occurs via bio_endio() in - * process_prepared_discard_passdown(). + * end_discard(). */ bio_inc_remaining(bio); if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) @@ -3850,7 +3884,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 18, 0}, + .version = {1, 19, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4224,7 +4258,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 18, 0}, + .version = {1, 19, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, -- cgit v0.10.2
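The dm-mpath patches in this series all apply one pattern: fields of 'struct multipath' that used to be bool bitfields and plain unsigned counters guarded by m->lock become a single flags word manipulated with set_bit()/clear_bit()/test_bit() plus atomic_t counters, so the I/O fast paths can read them without taking the spinlock. The standalone userspace C sketch below only illustrates that shape in isolation; it is not kernel code, it uses C11 atomics as stand-ins for the kernel helpers, and every name in it (struct mpath_like, F_QUEUE_IO, set_flag(), and so on) is hypothetical.

/*
 * Userspace sketch (not kernel code) of the transformation applied to
 * 'struct multipath': lock-protected bool bitfields and unsigned counters
 * become one flags word driven by atomic bit operations plus atomic counters.
 * All names are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Flag bit numbers, in the style of the MPATHF_* defines. */
#define F_QUEUE_IO          0
#define F_QUEUE_IF_NO_PATH  1
#define F_PG_INIT_REQUIRED  2

struct mpath_like {
	atomic_ulong flags;          /* replaces several 'bool x:1' members   */
	atomic_uint nr_valid_paths;  /* replaces 'unsigned' guarded by a lock */
	atomic_uint pg_init_count;
};

/* Userspace stand-ins for the kernel's set_bit()/clear_bit()/test_bit(). */
static void set_flag(struct mpath_like *m, unsigned bit)
{
	atomic_fetch_or(&m->flags, 1UL << bit);
}

static void clear_flag(struct mpath_like *m, unsigned bit)
{
	atomic_fetch_and(&m->flags, ~(1UL << bit));
}

static bool test_flag(struct mpath_like *m, unsigned bit)
{
	return atomic_load(&m->flags) & (1UL << bit);
}

int main(void)
{
	struct mpath_like m;

	atomic_init(&m.flags, 0);
	atomic_init(&m.nr_valid_paths, 0);
	atomic_init(&m.pg_init_count, 0);

	/* Setup in the style of alloc_multipath(): queue I/O until ready. */
	set_flag(&m, F_QUEUE_IO);

	/* Update in the style of reinstate_path(): atomic_inc_return() pattern. */
	if (atomic_fetch_add(&m.nr_valid_paths, 1) + 1 == 1)
		clear_flag(&m, F_QUEUE_IO);  /* first usable path appeared */

	/* Consume a "required" flag, as __pg_init_all_paths() does. */
	set_flag(&m, F_PG_INIT_REQUIRED);
	if (test_flag(&m, F_PG_INIT_REQUIRED)) {
		atomic_fetch_add(&m.pg_init_count, 1);
		clear_flag(&m, F_PG_INIT_REQUIRED);
	}

	printf("flags=%#lx valid_paths=%u pg_init_count=%u\n",
	       atomic_load(&m.flags),
	       atomic_load(&m.nr_valid_paths),
	       atomic_load(&m.pg_init_count));
	return 0;
}

It compiles with any C11 compiler (for example cc -std=c11). As in the patches above, the point of the transformation is that readers of individual flags and counters no longer need the lock; dm-mpath keeps m->lock only where several related fields (for example current_pgpath, current_pg and next_pg) must change together.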