From e7597e69dec59b65c5525db1626b9d34afdfa678 Mon Sep 17 00:00:00 2001
From: Jes Sorensen
Date: Tue, 16 Feb 2016 16:44:24 -0500
Subject: md/raid5: Compare apples to apples (or sectors to sectors)

'max_discard_sectors' is in sectors, while 'stripe' is in bytes.

This fixes the problem where DISCARD would get disabled on some larger
RAID5 configurations (6 or more drives in my testing), while it worked
as expected with smaller configurations.

Fixes: 620125f2bf8 ("MD: raid5 trim support")
Cc: stable@vger.kernel.org v3.7+
Signed-off-by: Jes Sorensen
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b4f02c9..7f770b0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7014,8 +7014,8 @@ static int raid5_run(struct mddev *mddev)
 	}
 
 	if (discard_supported &&
-	    mddev->queue->limits.max_discard_sectors >= stripe &&
-	    mddev->queue->limits.discard_granularity >= stripe)
+	    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
+	    mddev->queue->limits.discard_granularity >= stripe)
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
 					mddev->queue);
 	else
--
cgit v0.10.2

From 27a353c026a879a1001e5eac4bda75b16262c44a Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Wed, 24 Feb 2016 17:38:28 -0800
Subject: RAID5: check_reshape() shouldn't call mddev_suspend

check_reshape() is called from the raid5d thread. The raid5d thread
shouldn't call mddev_suspend(), because mddev_suspend() waits for all
IO to finish, but IO is handled in the raid5d thread, so we could
easily deadlock here.

This issue was introduced by 738a273 ("md/raid5: fix allocation of
'scribble' array.")

Cc: stable@vger.kernel.org (v4.1+)
Reported-and-tested-by: Artur Paszkiewicz
Reviewed-by: NeilBrown
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f770b0..b628cd5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2089,6 +2089,14 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
 	unsigned long cpu;
 	int err = 0;
 
+	/*
+	 * Never shrink. And mddev_suspend() could deadlock if this is called
+	 * from raid5d. In that case, scribble_disks and scribble_sectors
+	 * should equal to new_disks and new_sectors
+	 */
+	if (conf->scribble_disks >= new_disks &&
+	    conf->scribble_sectors >= new_sectors)
+		return 0;
 	mddev_suspend(conf->mddev);
 	get_online_cpus();
 	for_each_present_cpu(cpu) {
@@ -2110,6 +2118,10 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
 	}
 	put_online_cpus();
 	mddev_resume(conf->mddev);
+	if (!err) {
+		conf->scribble_disks = new_disks;
+		conf->scribble_sectors = new_sectors;
+	}
 	return err;
 }
 
@@ -6413,6 +6425,12 @@ static int raid5_alloc_percpu(struct r5conf *conf)
 	}
 	put_online_cpus();
 
+	if (!err) {
+		conf->scribble_disks = max(conf->raid_disks,
+			conf->previous_raid_disks);
+		conf->scribble_sectors = max(conf->chunk_sectors,
+			conf->prev_chunk_sectors);
+	}
 	return err;
 }
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a415e1c..ae6068d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -510,6 +510,8 @@ struct r5conf {
 					      * conversions
 					      */
 	} __percpu *percpu;
+	int scribble_disks;
+	int scribble_sectors;
 #ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block	cpu_notify;
 #endif
--
cgit v0.10.2

From 6ab2a4b806ae21b6c3e47c5ff1285ec06d505325 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 25 Feb 2016 16:24:42 -0800
Subject: RAID5: revert e9e4c377e2f563 to fix a livelock

Revert commit e9e4c377e2f563 ("md/raid5: per hash value and exclusive
wait_for_stripe").

The problem is that raid5_get_active_stripe() waits on
conf->wait_for_stripe[hash]. Assume hash is 0. My test releases stripes
in this order:

- release all stripes with hash 0
- raid5_get_active_stripe still sleeps since active_stripes >
  max_nr_stripes * 3 / 4
- release all stripes with hash other than 0. active_stripes becomes 0
- raid5_get_active_stripe still sleeps, since nobody wakes up
  wait_for_stripe[0]

The system livelocks. The root cause is that active_stripes isn't a
per-hash count. Reverting the patch makes the livelock go away.

Cc: stable@vger.kernel.org (v4.2+)
Cc: Yuanhan Liu
Cc: NeilBrown
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b628cd5..c25f2fb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -340,8 +340,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 					 int hash)
 {
 	int size;
-	unsigned long do_wakeup = 0;
-	int i = 0;
+	bool do_wakeup = false;
 	unsigned long flags;
 
 	if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -362,19 +361,15 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 			    !list_empty(list))
 				atomic_dec(&conf->empty_inactive_list_nr);
 			list_splice_tail_init(list, conf->inactive_list + hash);
-			do_wakeup |= 1 << hash;
+			do_wakeup = true;
 			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 		}
 		size--;
 		hash--;
 	}
 
-	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
-		if (do_wakeup & (1 << i))
-			wake_up(&conf->wait_for_stripe[i]);
-	}
-
 	if (do_wakeup) {
+		wake_up(&conf->wait_for_stripe);
 		if (atomic_read(&conf->active_stripes) == 0)
 			wake_up(&conf->wait_for_quiescent);
 		if (conf->retry_read_aligned)
@@ -687,15 +682,14 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			if (!sh) {
 				set_bit(R5_INACTIVE_BLOCKED,
 					&conf->cache_state);
-				wait_event_exclusive_cmd(
-					conf->wait_for_stripe[hash],
+				wait_event_lock_irq(
+					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
 					(atomic_read(&conf->active_stripes)
 					 < (conf->max_nr_stripes * 3 / 4)
 					 || !test_bit(R5_INACTIVE_BLOCKED,
 						      &conf->cache_state)),
-					spin_unlock_irq(conf->hash_locks + hash),
-					spin_lock_irq(conf->hash_locks + hash));
+					*(conf->hash_locks + hash));
 				clear_bit(R5_INACTIVE_BLOCKED,
 					  &conf->cache_state);
 			} else {
@@ -720,9 +714,6 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 		}
 	} while (sh == NULL);
 
-	if (!list_empty(conf->inactive_list + hash))
-		wake_up(&conf->wait_for_stripe[hash]);
-
 	spin_unlock_irq(conf->hash_locks + hash);
 	return sh;
 }
@@ -2202,7 +2193,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	cnt = 0;
 	list_for_each_entry(nsh, &newstripes, lru) {
 		lock_device_hash_lock(conf, hash);
-		wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
+		wait_event_cmd(conf->wait_for_stripe,
 				    !list_empty(conf->inactive_list + hash),
 				    unlock_device_hash_lock(conf, hash),
 				    lock_device_hash_lock(conf, hash));
@@ -6521,9 +6512,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	seqcount_init(&conf->gen_lock);
 	mutex_init(&conf->cache_size_mutex);
 	init_waitqueue_head(&conf->wait_for_quiescent);
-	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
-		init_waitqueue_head(&conf->wait_for_stripe[i]);
-	}
+	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ae6068d..517d4b6 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -524,7 +524,7 @@ struct r5conf {
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_quiescent;
-	wait_queue_head_t	wait_for_stripe[NR_STRIPE_HASH_LOCKS];
+	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
 	unsigned long		cache_state;
 #define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
--
cgit v0.10.2

From 399146b80ed6fb9f1ebe5a07234f00dff446d2b4 Mon Sep 17 00:00:00 2001
From: Sebastian Parschauer
Date: Wed, 17 Feb 2016 17:25:00 +0100
Subject: md: Drop sending a change uevent when stopping

When stopping an MD device, its device node /dev/mdX may still exist
afterwards, or it is recreated by udev. The next open() call can then
lead to the creation of an inoperable MD device. The reason for this is
that a change event (KOBJ_CHANGE) is sent to udev, which races against
the remove event (KOBJ_REMOVE) from md_free(). So drop sending the
change event.

A change is likely also required in mdadm, as many versions send the
change event to udev as well.

Neil mentioned that the change event is a workaround for an old kernel,
commit 934d9c23b4c7 ("md: destroy partitions and notify udev when md
array is stopped."). New mdadm can handle device removal now, so this
isn't required any more.

Cc: NeilBrown
Cc: Hannes Reinecke
Cc: Jes Sorensen
Signed-off-by: Sebastian Parschauer
Signed-off-by: Shaohua Li

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e55e6cf..464627b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5671,7 +5671,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
 		export_array(mddev);
 
 		md_clean(mddev);
-		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
 		if (mddev->hold_active == UNTIL_STOP)
 			mddev->hold_active = 0;
 	}
--
cgit v0.10.2

From 70d9798b95562abac005d4ba71d28820f9a201eb Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Wed, 24 Feb 2016 17:41:01 -0800
Subject: MD: warn for potential deadlock

The personality thread shouldn't call mddev_suspend(), because
mddev_suspend() waits for all IO to finish, but IO is handled in the
personality thread, so this could cause a deadlock. To catch such cases
early, add a warning if mddev_suspend() is called from the personality
thread.

Suggested-by: NeilBrown
Cc: Artur Paszkiewicz
Signed-off-by: Shaohua Li

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 464627b..c068f17 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -305,6 +305,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
  */
 void mddev_suspend(struct mddev *mddev)
 {
+	WARN_ON_ONCE(current == mddev->thread->tsk);
 	if (mddev->suspended++)
 		return;
 	synchronize_rcu();
--
cgit v0.10.2

From c97e0602bcfeaabf303b125e07ed7ba35c656495 Mon Sep 17 00:00:00 2001
From: Eric Engestrom
Date: Mon, 7 Mar 2016 12:01:05 +0000
Subject: md/bitmap: remove redundant check

daemon_sleep is an unsigned value, so testing whether it is 0 and
testing whether it is less than 1 are the same check.

Signed-off-by: Eric Engestrom
Signed-off-by: Shaohua Li

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index d80cce4..bbe7b64 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -510,8 +510,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	sb->chunksize = cpu_to_le32(chunksize);
 
 	daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
-	if (!daemon_sleep ||
-	    (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
+	if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
 		printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n");
 		daemon_sleep = 5 * HZ;
 	}
--
cgit v0.10.2

From 0f9ce866e0938f2e5f9c77277834b429661844b1 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 7 Mar 2016 09:41:09 -0800
Subject: Update MD git tree URL

Signed-off-by: Shaohua Li

diff --git a/MAINTAINERS b/MAINTAINERS
index 2fa3303..d930bcc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10147,7 +10147,7 @@ F:	drivers/media/pci/solo6x10/
 SOFTWARE RAID (Multiple Disks) SUPPORT
 M:	Shaohua Li
 L:	linux-raid@vger.kernel.org
-T:	git git://neil.brown.name/md
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
 S:	Supported
 F:	drivers/md/
 F:	include/linux/raid/
--
cgit v0.10.2

From 550da24f8d62fe81f3c13e3ec27602d6e44d43dc Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 9 Mar 2016 12:58:25 +1100
Subject: md/raid5: preserve STRIPE_PREREAD_ACTIVE in break_stripe_batch_list

break_stripe_batch_list breaks up a batch and copies some flags from
the batch head to the members, preserving others. It doesn't preserve
or copy STRIPE_PREREAD_ACTIVE. This is not normally a problem, as
STRIPE_PREREAD_ACTIVE is cleared when a stripe_head is added to a
batch, and is not set on stripe_heads already in a batch.

However there is no locking to ensure that one thread doesn't set the
flag after it has just been cleared in another. This does occasionally
happen.

md/raid5 maintains a count of the number of stripe_heads with
STRIPE_PREREAD_ACTIVE set: conf->preread_active_stripes. When
break_stripe_batch_list clears STRIPE_PREREAD_ACTIVE inadvertently,
this count becomes incorrect and will never again return to zero.

md/raid5 delays the handling of some stripe_heads until
preread_active_stripes becomes zero. So when the above-mentioned race
happens, those stripe_heads become blocked and never progress,
resulting in writes to the array hanging.

So: change break_stripe_batch_list to preserve STRIPE_PREREAD_ACTIVE
in the members of a batch.

URL: https://bugzilla.kernel.org/show_bug.cgi?id=108741
URL: https://bugzilla.redhat.com/show_bug.cgi?id=1258153
URL: http://thread.gmane.org/5649C0E9.2030204@zoner.cz
Reported-by: Martin Svec (and others)
Tested-by: Tom Weber
Fixes: 1b956f7a8f9a ("md/raid5: be more selective about distributing flags across batch.")
Cc: stable@vger.kernel.org (v4.1 and later)
Signed-off-by: NeilBrown
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c25f2fb..32d5287 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4239,7 +4239,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
 					  (1 << STRIPE_SYNCING) |
 					  (1 << STRIPE_REPLACED) |
-					  (1 << STRIPE_PREREAD_ACTIVE) |
 					  (1 << STRIPE_DELAYED) |
 					  (1 << STRIPE_BIT_DELAY) |
 					  (1 << STRIPE_FULL_WRITE) |
@@ -4254,6 +4253,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 					      (1 << STRIPE_REPLACED)));
 
 		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+					    (1 << STRIPE_PREREAD_ACTIVE) |
 					    (1 << STRIPE_DEGRADED)),
 			      head_sh->state & (1 << STRIPE_INSYNC));
--
cgit v0.10.2

From fb3229d5cd4c8e65178afe55f6727ff645cdb81a Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Wed, 9 Mar 2016 10:08:38 -0800
Subject: md/raid5: output stripe state for debug

Neil recently fixed an obscure race in break_stripe_batch_list.
Debugging would be much more convenient if we knew the stripe state;
that is what this patch provides.

Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 32d5287..31ac0f0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4236,7 +4236,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 
 		list_del_init(&sh->batch_list);
 
-		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+		WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
 					  (1 << STRIPE_SYNCING) |
 					  (1 << STRIPE_REPLACED) |
 					  (1 << STRIPE_DELAYED) |
@@ -4248,9 +4248,11 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 					  (1 << STRIPE_DISCARD) |
 					  (1 << STRIPE_BATCH_READY) |
 					  (1 << STRIPE_BATCH_ERR) |
-					  (1 << STRIPE_BITMAP_PENDING)));
-		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
-					      (1 << STRIPE_REPLACED)));
+					  (1 << STRIPE_BITMAP_PENDING)),
+			"stripe state: %lx\n", sh->state);
+		WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+					      (1 << STRIPE_REPLACED)),
+			"head stripe state: %lx\n", head_sh->state);
 
 		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
 					    (1 << STRIPE_PREREAD_ACTIVE) |
--
cgit v0.10.2

From fafcde3ac1a418688a734365203a12483b83907a Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sat, 12 Mar 2016 09:29:40 +0800
Subject: md: multipath: don't hardcopy bio in .make_request path

Inside multipath_make_request(), multipath maps the incoming bio into
the low level device's bio, but it is totally wrong to copy the bio
into the mapped bio via '*mapped_bio = *bio'. For example,
.__bi_remaining is kept in the copy, especially if the incoming bio is
chained to via bio splitting, so .bi_end_io can't be called for the
mapped bio at all in the completion path in this kind of situation.

This patch fixes the issue by cloning the bio instead.

Cc: stable@vger.kernel.org (v3.14+)
Reported-and-tested-by: Andrea Righi
Signed-off-by: Ming Lei
Signed-off-by: Shaohua Li

diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0a72ab6..dd483bb 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -129,7 +129,9 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	}
 	multipath = conf->multipaths + mp_bh->path;
 
-	mp_bh->bio = *bio;
+	bio_init(&mp_bh->bio);
+	__bio_clone_fast(&mp_bh->bio, bio);
+
 	mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
 	mp_bh->bio.bi_bdev = multipath->rdev->bdev;
 	mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT;
--
cgit v0.10.2

From b3c95b425e0991840262dba8dd095f2682dbf848 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 14 Mar 2016 17:01:37 +0800
Subject: md/raid1: remove unnecessary BUG_ON

Since bitmap_start_sync will not return until sync_blocks is at least
PAGE_SIZE>>9, the BUG_ON is not needed anymore.

Signed-off-by: Guoqing Jiang
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4e3843f..d1d5363 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2695,7 +2695,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			    !conf->fullsync &&
 			    !test_bit(MD_RECOVERY_REQUESTED,
 				      &mddev->recovery))
 				break;
-			BUG_ON(sync_blocks < (PAGE_SIZE>>9));
 			if ((len >> 9) > sync_blocks)
 				len = sync_blocks<<9;
 		}
--
cgit v0.10.2

From c6f0b9f195416e2d3c474e51190b6aa0c238aa36 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 14 Mar 2016 17:01:38 +0800
Subject: md/bitmap: remove redundant return in bitmap_checkpage

The "return 0" is not needed, since bitmap_checkpage will return 0 for
this case anyway.

Signed-off-by: Guoqing Jiang
Signed-off-by: Shaohua Li

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index bbe7b64..7df6b4f 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -98,7 +98,6 @@ __acquires(bitmap->lock)
 		   bitmap->bp[page].hijacked) {
 		/* somebody beat us to getting the page */
 		kfree(mappage);
-		return 0;
 	} else {
 
 		/* no page was in place and we have one, so install it */
--
cgit v0.10.2

From d85326cf86d7f52d1ac23c38f745470066acc76f Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 14 Mar 2016 17:01:39 +0800
Subject: md: fix typos for stipe

Signed-off-by: Guoqing Jiang
Signed-off-by: Shaohua Li

diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 7d5c3a6..5e3fcd6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -49,8 +49,8 @@
 * When we set a bit, or in the counter (to start a write), if the fields is
 * 0, we first set the disk bit and set the counter to 1.
 *
- * If the counter is 0, the on-disk bit is clear and the stipe is clean
- * Anything that dirties the stipe pushes the counter to 2 (at least)
+ * If the counter is 0, the on-disk bit is clear and the stripe is clean
+ * Anything that dirties the stripe pushes the counter to 2 (at least)
 * and sets the on-disk bit (lazily).
 * If a periodic sweep find the counter at 2, it is decremented to 1.
 * If the sweep find the counter at 1, the on-disk bit is cleared and the
--
cgit v0.10.2

From ccfc7bf1f09d6190ef86693ddc761d5fe3fa47cb Mon Sep 17 00:00:00 2001
From: Nate Dailey
Date: Mon, 29 Feb 2016 10:43:58 -0500
Subject: raid1: include bio_end_io_list in nr_queued to prevent freeze_array hang

If raid1d is handling a mix of read and write errors,
handle_read_error's call to freeze_array can get stuck.
This can happen because, though the bio_end_io_list is initially
drained, writes can be added to it via handle_write_finished as the
retry_list is processed. These writes contribute to nr_pending but are
not included in nr_queued.

If a later entry on the retry_list triggers a call to
handle_read_error, freeze_array hangs waiting for
nr_pending == nr_queued+extra. The writes on the bio_end_io_list aren't
included in nr_queued, so the condition will never be satisfied.

To prevent the hang, include bio_end_io_list writes in nr_queued.
There's probably a better way to handle decrementing nr_queued, but
this seemed like the safest way to avoid breaking surrounding code. I'm
happy to supply the script I used to reproduce this hang.

Fixes: 55ce74d4bfe1b ("md/raid1: ensure device failure recorded before write request returns.")
Cc: stable@vger.kernel.org (v4.3+)
Signed-off-by: Nate Dailey
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d1d5363..39fb21e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2274,6 +2274,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 	if (fail) {
 		spin_lock_irq(&conf->device_lock);
 		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+		conf->nr_queued++;
 		spin_unlock_irq(&conf->device_lock);
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -2391,8 +2392,10 @@ static void raid1d(struct md_thread *thread)
 		LIST_HEAD(tmp);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
-			list_add(&tmp, &conf->bio_end_io_list);
-			list_del_init(&conf->bio_end_io_list);
+			while (!list_empty(&conf->bio_end_io_list)) {
+				list_move(conf->bio_end_io_list.prev, &tmp);
+				conf->nr_queued--;
+			}
 		}
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		while (!list_empty(&tmp)) {
--
cgit v0.10.2

From 23ddba80ebe836476bb2fa1f5ef305dd1c63dc0b Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 14 Mar 2016 11:49:32 -0700
Subject: raid10: include bio_end_io_list in nr_queued to prevent freeze_array hang

This is the raid10 counterpart of the bug fixed by Nate (raid1: include
bio_end_io_list in nr_queued to prevent freeze_array hang).

Fixes: 95af587e95 ("md/raid10: ensure device failure recorded before write request returns")
Cc: stable@vger.kernel.org (v4.3+)
Cc: Nate Dailey
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1c1447d..e3fd725 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2664,6 +2664,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 	if (fail) {
 		spin_lock_irq(&conf->device_lock);
 		list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
+		conf->nr_queued++;
 		spin_unlock_irq(&conf->device_lock);
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -2691,8 +2692,10 @@ static void raid10d(struct md_thread *thread)
 		LIST_HEAD(tmp);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
-			list_add(&tmp, &conf->bio_end_io_list);
-			list_del_init(&conf->bio_end_io_list);
+			while (!list_empty(&conf->bio_end_io_list)) {
+				list_move(conf->bio_end_io_list.prev, &tmp);
+				conf->nr_queued--;
+			}
 		}
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		while (!list_empty(&tmp)) {
--
cgit v0.10.2

From 1d034e68e2c256640eb1f44bd7dcd89f90806ccf Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner
Date: Wed, 16 Mar 2016 09:25:23 +0100
Subject: md/raid5: Cleanup cpu hotplug notifier

The raid456_cpu_notify() hotplug callback lacks handling of the
CPU_UP_CANCELED case.
That means if CPU_UP_PREPARE fails, the scratch buffer is leaked.

Add handling for CPU_UP_CANCELED[_FROZEN] hotplug notifier transitions
to free the scratch buffer.

CC: Shaohua Li
CC: linux-raid@vger.kernel.org
Signed-off-by: Anna-Maria Gleixner
Signed-off-by: Shaohua Li

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 31ac0f0..8ab8b65 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6381,6 +6381,8 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
 		break;
 	default:
--
cgit v0.10.2
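
For context, the overall shape of the notifier after the last patch is sketched below. This is an approximation rather than the verbatim upstream function: it assumes the kernel's notifier and per-CPU APIs (linux/notifier.h, linux/percpu.h), and the allocation helper alloc_scratch_buffer() is an assumption, since only free_scratch_buffer() appears in the diff above. The case labels and the shared teardown path are what the patch establishes.

/* Illustrative sketch only; see the hunk above for the actual change. */
static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
			      void *hcpu)
{
	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
	long cpu = (long)hcpu;
	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/*
		 * Allocate per-CPU scratch space before the CPU comes online.
		 * alloc_scratch_buffer() is an assumed helper here.
		 */
		if (alloc_scratch_buffer(conf, percpu))
			return notifier_from_errno(-ENOMEM);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		/*
		 * Free the scratch buffer when the CPU goes away, and also when
		 * bringing it up is cancelled after a successful CPU_UP_PREPARE;
		 * without the CANCELED cases the buffer would leak.
		 */
		free_scratch_buffer(conf, percpu);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}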