From 60559da4d8c3259ea41a14ca4cfcd83022bff6bd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Jul 2013 16:44:29 +1000
Subject: md: don't call md_allow_write in get_bitmap_file.

There is no really need as GFP_NOIO is very likely sufficient,
and failure is not catastrophic.

Calling md_allow_write here will convert a read-auto array to
read/write which could be confusing when you are just performing
a read operation.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9f13e13..bba8732 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5628,10 +5628,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	if (md_allow_write(mddev))
-		file = kmalloc(sizeof(*file), GFP_NOIO);
-	else
-		file = kmalloc(sizeof(*file), GFP_KERNEL);
+	file = kmalloc(sizeof(*file), GFP_NOIO);
 
 	if (!file)
 		goto out;
-- 
cgit v0.10.2


From 275c51c4e34ed776d40a99dd97c1deee50303b07 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 8 Aug 2013 09:20:32 +1000
Subject: md: fix safe_mode buglet.

Whe we set the safe_mode_timeout to a smaller value we trigger a timeout
immediately - otherwise the small value might not be honoured.
However if the previous timeout was 0 meaning "no timeout", we didn't.
This would mean that no timeout happens until the next write completes,
which could be a long time.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bba8732..16b0822 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3429,7 +3429,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
 		mddev->safemode_delay = (msec*HZ)/1000;
 		if (mddev->safemode_delay == 0)
 			mddev->safemode_delay = 1;
-		if (mddev->safemode_delay < old_delay)
+		if (mddev->safemode_delay < old_delay || old_delay == 0)
 			md_safemode_timeout((unsigned long)mddev);
 	}
 	return len;
-- 
cgit v0.10.2


From ae77cbc1e7b90473a2b0963bce0e1eb163873214 Mon Sep 17 00:00:00 2001
From: Ken Steele <ken@tilera.com>
Date: Wed, 7 Aug 2013 12:39:56 -0400
Subject: RAID: add tilegx SIMD implementation of raid6

This change adds TILE-Gx SIMD instructions to the software raid
(md), modeling the Altivec implementation. This is only for Syndrome
generation; there is more that could be done to improve recovery,
as in the recent Intel SSE3 recovery implementation.

The code unrolls 8 times; this turns out to be the best on tilegx
hardware among the set 1, 2, 4, 8 or 16.  The code reads one
cache-line of data from each disk, stores P and Q then goes to the
next cache-line.

The test code in sys/linux/lib/raid6/test reports 2008 MB/s data
read rate for syndrome generation using 18 disks (16 data and 2
parity). It was 1512 MB/s before this SIMD optimizations. This is
running on 1 core with all the data in cache.

This is based on the paper The Mathematics of RAID-6.
(http://kernel.org/pub/linux/kernel/people/hpa/raid6.pdf).

Signed-off-by: Ken Steele <ken@tilera.com>
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 8dfaa2c..6d5df8a 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -101,6 +101,7 @@ extern const struct raid6_calls raid6_altivec8;
 extern const struct raid6_calls raid6_avx2x1;
 extern const struct raid6_calls raid6_avx2x2;
 extern const struct raid6_calls raid6_avx2x4;
+extern const struct raid6_calls raid6_tilegx8;
 
 struct raid6_recov_calls {
 	void (*data2)(int, size_t, int, int, void **);
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 9f7c184..e5e9021 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -5,6 +5,7 @@ raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
+raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 
 hostprogs-y	+= mktables
 
@@ -70,6 +71,11 @@ $(obj)/altivec8.c:   UNROLL := 8
 $(obj)/altivec8.c:   $(src)/altivec.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
+targets += tilegx8.c
+$(obj)/tilegx8.c:   UNROLL := 8
+$(obj)/tilegx8.c:   $(src)/tilegx.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
 quiet_cmd_mktable = TABLE   $@
       cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 6d7316f..b9f3401 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -66,6 +66,9 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_altivec4,
 	&raid6_altivec8,
 #endif
+#if defined(CONFIG_TILEGX)
+	&raid6_tilegx8,
+#endif
 	&raid6_intx1,
 	&raid6_intx2,
 	&raid6_intx4,
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 087332d..78f89d8 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -35,6 +35,9 @@ else
                 OBJS += altivec1.o altivec2.o altivec4.o altivec8.o
         endif
 endif
+ifeq ($(ARCH),tilegx)
+OBJS += tilegx8.o
+endif
 
 .c.o:
 	$(CC) $(CFLAGS) -c -o $@ $<
@@ -85,11 +88,15 @@ int16.c: int.uc ../unroll.awk
 int32.c: int.uc ../unroll.awk
 	$(AWK) ../unroll.awk -vN=32 < int.uc > $@
 
+tilegx8.c: tilegx.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=8 < tilegx.uc > $@
+
 tables.c: mktables
 	./mktables > tables.c
 
 clean:
 	rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test
+	rm -f tilegx*.c
 
 spotless: clean
 	rm -f *~
diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc
new file mode 100644
index 0000000..e7c2945
--- /dev/null
+++ b/lib/raid6/tilegx.uc
@@ -0,0 +1,86 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *   Copyright 2012 Tilera Corporation - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * tilegx$#.c
+ *
+ * $#-way unrolled TILE-Gx SIMD for RAID-6 math.
+ *
+ * This file is postprocessed using unroll.awk.
+ *
+ */
+
+#include <linux/raid/pq.h>
+
+/* Create 8 byte copies of constant byte */
+# define NBYTES(x) (__insn_v1addi(0, x))
+# define NSIZE  8
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ u64 SHLBYTE(u64 v)
+{
+	/* Vector One Byte Shift Left Immediate. */
+	return __insn_v1shli(v, 1);
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ u64 MASK(u64 v)
+{
+	/* Vector One Byte Shift Right Signed Immediate. */
+	return __insn_v1shrsi(v, 7);
+}
+
+
+void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u64 *p, *q;
+	int d, z, z0;
+
+	u64 wd$$, wq$$, wp$$, w1$$, w2$$;
+	u64 x1d = NBYTES(0x1d);
+	u64 * z0ptr;
+
+	z0 = disks - 3;			/* Highest data disk */
+	p = (u64 *)dptr[z0+1];	/* XOR parity */
+	q = (u64 *)dptr[z0+2];	/* RS syndrome */
+
+	z0ptr = (u64 *)&dptr[z0][0];
+	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+		wq$$ = wp$$ = *z0ptr++;
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			wd$$ = *(u64 *)&dptr[z][d+$$*NSIZE];
+			wp$$ = wp$$ ^ wd$$;
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+			w2$$ = w2$$ & x1d;
+			w1$$ = w1$$ ^ w2$$;
+			wq$$ = w1$$ ^ wd$$;
+		}
+		*p++ = wp$$;
+		*q++ = wq$$;
+	}
+}
+
+const struct raid6_calls raid6_tilegx$# = {
+	raid6_tilegx$#_gen_syndrome,
+	NULL,
+	"tilegx$#",
+	0
+};
-- 
cgit v0.10.2


From c28399b5943a24a214a44e973a3a8002fd36442d Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc@gmail.com>
Date: Thu, 22 Aug 2013 18:53:06 +0400
Subject: raid6/test: replace echo -e with printf

-e is a non-standard echo option, echo output is
implementation-dependent when it is used. Replace echo -e with printf as
suggested by POSIX echo manual.

Cc: NeilBrown <neilb@suse.de>
Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 78f89d8..cf49592 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -28,7 +28,7 @@ ifeq ($(IS_X86),yes)
                     gcc -c -x assembler - >&/dev/null &&	\
                     rm ./-.o && echo -DCONFIG_AS_AVX2=1)
 else
-        HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\
+        HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\
                          gcc -c -x c - >&/dev/null && \
                          rm ./-.o && echo yes)
         ifeq ($(HAS_ALTIVEC),yes)
-- 
cgit v0.10.2


From c9ad020fec895bf1e5fcc322d0ab9e67efd3e3a0 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Mon, 19 Aug 2013 22:26:32 -0400
Subject: md: Fix apparent cut-and-paste error in super_90_validate

Setting a variable to itself probably wasn't the intention here.

Signed-off-by: Dave Jones <davej@fedoraproject.org>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 16b0822..a2678d8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1180,7 +1180,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 			mddev->bitmap_info.offset =
 				mddev->bitmap_info.default_offset;
 			mddev->bitmap_info.space =
-				mddev->bitmap_info.space;
+				mddev->bitmap_info.default_space;
 		}
 
 	} else if (mddev->pers == NULL) {
-- 
cgit v0.10.2


From 7a0a5355cbc71efa430c3730ffbd67ae04abfe31 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 27 Aug 2013 16:28:23 +1000
Subject: md: Don't test all of mddev->flags at once.

mddev->flags is mostly used to record if an update of the
metadata is needed.  Sometimes the whole field is tested
instead of just the important bits.  This makes it difficult
to introduce more state bits.

So replace all bare tests of mddev->flags with tests for the bits
that actually need testing.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a2678d8..084a654 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5144,7 +5144,7 @@ int md_run(struct mddev *mddev)
 	
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	
-	if (mddev->flags)
+	if (mddev->flags & MD_UPDATE_SB_FLAGS)
 		md_update_sb(mddev, 0);
 
 	md_new_event(mddev);
@@ -5289,7 +5289,7 @@ static void __md_stop_writes(struct mddev *mddev)
 	md_super_wait(mddev);
 
 	if (mddev->ro == 0 &&
-	    (!mddev->in_sync || mddev->flags)) {
+	    (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
 		/* mark array as shutdown cleanly */
 		mddev->in_sync = 1;
 		md_update_sb(mddev, 1);
@@ -7814,7 +7814,7 @@ void md_check_recovery(struct mddev *mddev)
 				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
-		if (mddev->flags)
+		if (mddev->flags & MD_UPDATE_SB_FLAGS)
 			md_update_sb(mddev, 0);
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 20f02c0..53283be 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -204,11 +204,12 @@ struct mddev {
 	struct md_personality		*pers;
 	dev_t				unit;
 	int				md_minor;
-	struct list_head 		disks;
+	struct list_head		disks;
 	unsigned long			flags;
 #define MD_CHANGE_DEVS	0	/* Some device status has changed */
 #define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
+#define MD_UPDATE_SB_FLAGS (1 | 2 | 4)	/* If these are set, md_update_sb needed */
 #define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
 
 	int				suspended;
@@ -218,7 +219,7 @@ struct mddev {
 						       * are happening, so run/
 						       * takeover/stop are not safe
 						       */
-	int				ready; /* See when safe to pass 
+	int				ready; /* See when safe to pass
 						* IO requests down */
 	struct gendisk			*gendisk;
 
-- 
cgit v0.10.2


From 260fa034ef7a4ff8b73068b48ac497edd5217491 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 27 Aug 2013 16:44:13 +1000
Subject: md: avoid deadlock when dirty buffers during md_stop.

When the last process closes /dev/mdX sync_blockdev will be called so
that all buffers get flushed.
So if it is then opened for the STOP_ARRAY ioctl to be sent there will
be nothing to flush.

However if we open /dev/mdX in order to send the STOP_ARRAY ioctl just
moments before some other process which was writing closes their file
descriptor, then there won't be a 'last close' and the buffers might
not get flushed.

So do_md_stop() calls sync_blockdev().  However at this point it is
holding ->reconfig_mutex.  So if the array is currently 'clean' then
the writes from sync_blockdev() will not complete until the array
can be marked dirty and that won't happen until some other thread
can get ->reconfig_mutex.  So we deadlock.

We need to move the sync_blockdev() call to before we take
->reconfig_mutex.
However then some other thread could open /dev/mdX and write to it
after we call sync_blockdev() and before we actually stop the array.
This can leave dirty data in the page cache which is awkward.

So introduce new flag MD_STILL_CLOSED.  Set it before calling
sync_blockdev(), clear it if anyone does open the file, and abort the
STOP_ARRAY attempt if it gets set before we lock against further
opens.

It is still possible to get problems if you open /dev/mdX, write to
it, then issue the STOP_ARRAY ioctl.  Just don't do that.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 084a654..adf4d7e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5337,8 +5337,14 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		err = -EBUSY;
 		goto out;
 	}
-	if (bdev)
-		sync_blockdev(bdev);
+	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
+		/* Someone opened the device since we flushed it
+		 * so page cache could be dirty and it is too late
+		 * to flush.  So abort
+		 */
+		mutex_unlock(&mddev->open_mutex);
+		return -EBUSY;
+	}
 	if (mddev->pers) {
 		__md_stop_writes(mddev);
 
@@ -5373,14 +5379,14 @@ static int do_md_stop(struct mddev * mddev, int mode,
 		mutex_unlock(&mddev->open_mutex);
 		return -EBUSY;
 	}
-	if (bdev)
-		/* It is possible IO was issued on some other
-		 * open file which was closed before we took ->open_mutex.
-		 * As that was not the last close __blkdev_put will not
-		 * have called sync_blockdev, so we must.
+	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
+		/* Someone opened the device since we flushed it
+		 * so page cache could be dirty and it is too late
+		 * to flush.  So abort
 		 */
-		sync_blockdev(bdev);
-
+		mutex_unlock(&mddev->open_mutex);
+		return -EBUSY;
+	}
 	if (mddev->pers) {
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
@@ -6417,6 +6423,20 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 						 !test_bit(MD_RECOVERY_NEEDED,
 							   &mddev->flags),
 						 msecs_to_jiffies(5000));
+	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
+		/* Need to flush page cache, and ensure no-one else opens
+		 * and writes
+		 */
+		mutex_lock(&mddev->open_mutex);
+		if (atomic_read(&mddev->openers) > 1) {
+			mutex_unlock(&mddev->open_mutex);
+			err = -EBUSY;
+			goto abort;
+		}
+		set_bit(MD_STILL_CLOSED, &mddev->flags);
+		mutex_unlock(&mddev->open_mutex);
+		sync_blockdev(bdev);
+	}
 	err = mddev_lock(mddev);
 	if (err) {
 		printk(KERN_INFO 
@@ -6670,6 +6690,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 
 	err = 0;
 	atomic_inc(&mddev->openers);
+	clear_bit(MD_STILL_CLOSED, &mddev->flags);
 	mutex_unlock(&mddev->open_mutex);
 
 	check_disk_change(bdev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 53283be..608050c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -211,6 +211,9 @@ struct mddev {
 #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
 #define MD_UPDATE_SB_FLAGS (1 | 2 | 4)	/* If these are set, md_update_sb needed */
 #define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
+#define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
+				 * md_ioctl checked on it.
+				 */
 
 	int				suspended;
 	atomic_t			active_io;
-- 
cgit v0.10.2


From 773ca82fa1ee58dd1bf88b6a5ca385ec83a2cac6 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Tue, 27 Aug 2013 17:50:39 +0800
Subject: raid5: make release_stripe lockless

release_stripe still has big lock contention. We just add the stripe to a llist
without taking device_lock. We let the raid5d thread to do the real stripe
release, which must hold device_lock anyway. In this way, release_stripe
doesn't hold any locks.

The side effect is the released stripes order is changed. But sounds not a big
deal, stripes are never handled in order. And I thought block layer can already
do nice request merge, which means order isn't that important.

I kept the unplug release batch, which is unnecessary with this patch from lock
contention avoid point of view, and actually if we delete it, the stripe_head
release_list and lru can share storage. But the unplug release batch is also
helpful for request merge. We probably can delay wakeup raid5d till unplug, but
I'm still afraid of the case which raid5d is running.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78ea443..287cc3b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -239,12 +239,47 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
 		do_release_stripe(conf, sh);
 }
 
+/* should hold conf->device_lock already */
+static int release_stripe_list(struct r5conf *conf)
+{
+	struct stripe_head *sh;
+	int count = 0;
+	struct llist_node *head;
+
+	head = llist_del_all(&conf->released_stripes);
+	while (head) {
+		sh = llist_entry(head, struct stripe_head, release_list);
+		head = llist_next(head);
+		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
+		smp_mb();
+		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
+		/*
+		 * Don't worry the bit is set here, because if the bit is set
+		 * again, the count is always > 1. This is true for
+		 * STRIPE_ON_UNPLUG_LIST bit too.
+		 */
+		__release_stripe(conf, sh);
+		count++;
+	}
+
+	return count;
+}
+
 static void release_stripe(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	unsigned long flags;
+	bool wakeup;
 
+	if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
+		goto slow_path;
+	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
+	if (wakeup)
+		md_wakeup_thread(conf->mddev->thread);
+	return;
+slow_path:
 	local_irq_save(flags);
+	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
 	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
 		do_release_stripe(conf, sh);
 		spin_unlock(&conf->device_lock);
@@ -491,7 +526,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 			if (atomic_read(&sh->count)) {
 				BUG_ON(!list_empty(&sh->lru)
 				    && !test_bit(STRIPE_EXPANDING, &sh->state)
-				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
+				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
+				    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
 			} else {
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
@@ -4127,6 +4163,10 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
 			 */
 			smp_mb__before_clear_bit();
 			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
+			/*
+			 * STRIPE_ON_RELEASE_LIST could be set here. In that
+			 * case, the count is always > 1 here
+			 */
 			__release_stripe(conf, sh);
 			cnt++;
 		}
@@ -4836,7 +4876,9 @@ static void raid5d(struct md_thread *thread)
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct bio *bio;
-		int batch_size;
+		int batch_size, released;
+
+		released = release_stripe_list(conf);
 
 		if (
 		    !list_empty(&conf->bitmap_list)) {
@@ -4861,7 +4903,7 @@ static void raid5d(struct md_thread *thread)
 		}
 
 		batch_size = handle_active_stripes(conf);
-		if (!batch_size)
+		if (!batch_size && !released)
 			break;
 		handled += batch_size;
 
@@ -5176,6 +5218,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
 	INIT_LIST_HEAD(&conf->inactive_list);
+	init_llist_head(&conf->released_stripes);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 70c4932..a98f99d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -197,6 +197,7 @@ enum reconstruct_states {
 struct stripe_head {
 	struct hlist_node	hash;
 	struct list_head	lru;	      /* inactive_list or handle_list */
+	struct llist_node	release_list;
 	struct r5conf		*raid_conf;
 	short			generation;	/* increments with every
 						 * reshape */
@@ -321,6 +322,7 @@ enum {
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
 	STRIPE_DISCARD,
+	STRIPE_ON_RELEASE_LIST,
 };
 
 /*
@@ -445,6 +447,7 @@ struct r5conf {
 	 */
 	atomic_t		active_stripes;
 	struct list_head	inactive_list;
+	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
 	int			inactive_blocked;	/* release of inactive stripes blocked,
-- 
cgit v0.10.2


From d265d9dc1d25a69affc21ae9fe5004b9d09c10ef Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Wed, 28 Aug 2013 14:29:05 +0800
Subject: raid5: fix stripe release order

patch "make release_stripe lockless" changes the order stripes are released.
Originally I thought block layer can take care of request merge, but it appears
there are still some requests not merged. It's easy to fix the order.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 287cc3b..d87a2de 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -239,6 +239,20 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
 		do_release_stripe(conf, sh);
 }
 
+static struct llist_node *llist_reverse_order(struct llist_node *head)
+{
+	struct llist_node *new_head = NULL;
+
+	while (head) {
+		struct llist_node *tmp = head;
+		head = head->next;
+		tmp->next = new_head;
+		new_head = tmp;
+	}
+
+	return new_head;
+}
+
 /* should hold conf->device_lock already */
 static int release_stripe_list(struct r5conf *conf)
 {
@@ -247,6 +261,7 @@ static int release_stripe_list(struct r5conf *conf)
 	struct llist_node *head;
 
 	head = llist_del_all(&conf->released_stripes);
+	head = llist_reverse_order(head);
 	while (head) {
 		sh = llist_entry(head, struct stripe_head, release_list);
 		head = llist_next(head);
-- 
cgit v0.10.2


From 851c30c9badfc6b294c98e887624bff53644ad21 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Wed, 28 Aug 2013 14:30:16 +0800
Subject: raid5: offload stripe handle to workqueue

This is another attempt to create multiple threads to handle raid5 stripes.
This time I use workqueue.

raid5 handles request (especially write) in stripe unit. A stripe is page size
aligned/long and acrosses all disks. Writing to any disk sector, raid5 runs a
state machine for the corresponding stripe, which includes reading some disks
of the stripe, calculating parity, and writing some disks of the stripe. The
state machine is running in raid5d thread currently. Since there is only one
thread, it doesn't scale well for high speed storage. An obvious solution is
multi-threading.

To get better performance, we have some requirements:
a. locality. stripe corresponding to request submitted from one cpu is better
handled in thread in local cpu or local node. local cpu is preferred but some
times could be a bottleneck, for example, parity calculation is too heavy.
local node running has wide adaptability.
b. configurablity. Different setup of raid5 array might need diffent
configuration. Especially the thread number. More threads don't always mean
better performance because of lock contentions.

My original implementation is creating some kernel threads. There are
interfaces to control which cpu's stripe each thread should handle. And
userspace can set affinity of the threads. This provides biggest flexibility
and configurability. But it's hard to use and apparently a new thread pool
implementation is disfavor.

Recent workqueue improvement is quite promising. unbound workqueue will be
bound to numa node. If WQ_SYSFS is set in workqueue, there are sysfs option to
do affinity setting. For example, we can only include one HT sibling in
affinity. Since work is non-reentrant by default, and we can control running
thread number by limiting dispatched work_struct number.

In this patch, I created several stripe worker group. A group is a numa node.
stripes from cpus of one node will be added to a group list. Workqueue thread
of one node will only handle stripes of worker group of the node. In this way,
stripe handling has numa node locality. And as I said, we can control thread
number by limiting dispatched work_struct number.

The work_struct callback function handles several stripes in one run. A typical
work queue usage is to run one unit in each work_struct. In raid5 case, the
unit is a stripe. But we can't do that:
a. Though handling a stripe doesn't need lock because of reference accounting
and stripe isn't in any list, queuing a work_struct for each stripe will make
workqueue lock contended very heavily.
b. blk_start_plug()/blk_finish_plug() should surround stripe handle, as we
might dispatch request. If each work_struct only handles one stripe, such block
plug is meaningless.

This implementation can't do very fine grained configuration. But the numa
binding is most popular usage model, should be enough for most workloads.

Note: since we have only one stripe queue, switching to multi-thread might
decrease request size dispatching down to low level layer. The impact depends
on thread number, raid configuration and workload. So multi-thread raid5 might
not be proper for all setups.

Changes V1 -> V2:
1. remove WQ_NON_REENTRANT
2. disabling multi-threading by default
3. Add more descriptions in changelog

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d87a2de..32fa113 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,7 @@
 #include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/nodemask.h>
 #include <trace/events/block.h>
 
 #include "md.h"
@@ -60,6 +61,10 @@
 #include "raid0.h"
 #include "bitmap.h"
 
+#define cpu_to_group(cpu) cpu_to_node(cpu)
+#define ANY_GROUP NUMA_NO_NODE
+
+static struct workqueue_struct *raid5_wq;
 /*
  * Stripe cache
  */
@@ -200,6 +205,34 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5worker_group *group;
+	int i, cpu = sh->cpu;
+
+	if (!cpu_online(cpu)) {
+		cpu = cpumask_any(cpu_online_mask);
+		sh->cpu = cpu;
+	}
+
+	if (list_empty(&sh->lru)) {
+		struct r5worker_group *group;
+		group = conf->worker_groups + cpu_to_group(cpu);
+		list_add_tail(&sh->lru, &group->handle_list);
+	}
+
+	if (conf->worker_cnt_per_group == 0) {
+		md_wakeup_thread(conf->mddev->thread);
+		return;
+	}
+
+	group = conf->worker_groups + cpu_to_group(sh->cpu);
+
+	for (i = 0; i < conf->worker_cnt_per_group; i++)
+		queue_work_on(sh->cpu, raid5_wq, &group->workers[i].work);
+}
+
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	BUG_ON(!list_empty(&sh->lru));
@@ -214,7 +247,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 		else {
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
-			list_add_tail(&sh->lru, &conf->handle_list);
+			if (conf->worker_cnt_per_group == 0) {
+				list_add_tail(&sh->lru, &conf->handle_list);
+			} else {
+				raid5_wakeup_stripe_thread(sh);
+				return;
+			}
 		}
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -409,6 +447,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 		raid5_build_block(sh, i, previous);
 	}
 	insert_hash(conf, sh);
+	sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -3830,6 +3869,7 @@ static void raid5_activate_delayed(struct r5conf *conf)
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
+			raid5_wakeup_stripe_thread(sh);
 		}
 	}
 }
@@ -4109,18 +4149,32 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-	struct stripe_head *sh;
+	struct stripe_head *sh = NULL, *tmp;
+	struct list_head *handle_list = NULL;
+
+	if (conf->worker_cnt_per_group == 0) {
+		handle_list = &conf->handle_list;
+	} else if (group != ANY_GROUP) {
+		handle_list = &conf->worker_groups[group].handle_list;
+	} else {
+		int i;
+		for (i = 0; i < conf->group_cnt; i++) {
+			handle_list = &conf->worker_groups[i].handle_list;
+			if (!list_empty(handle_list))
+				break;
+		}
+	}
 
 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
 		  __func__,
-		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  list_empty(handle_list) ? "empty" : "busy",
 		  list_empty(&conf->hold_list) ? "empty" : "busy",
 		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
 
-	if (!list_empty(&conf->handle_list)) {
-		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+	if (!list_empty(handle_list)) {
+		sh = list_entry(handle_list->next, typeof(*sh), lru);
 
 		if (list_empty(&conf->hold_list))
 			conf->bypass_count = 0;
@@ -4138,12 +4192,25 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
 		   ((conf->bypass_threshold &&
 		     conf->bypass_count > conf->bypass_threshold) ||
 		    atomic_read(&conf->pending_full_writes) == 0)) {
-		sh = list_entry(conf->hold_list.next,
-				typeof(*sh), lru);
-		conf->bypass_count -= conf->bypass_threshold;
-		if (conf->bypass_count < 0)
-			conf->bypass_count = 0;
-	} else
+
+		list_for_each_entry(tmp, &conf->hold_list,  lru) {
+			if (conf->worker_cnt_per_group == 0 ||
+			    group == ANY_GROUP ||
+			    !cpu_online(tmp->cpu) ||
+			    cpu_to_group(tmp->cpu) == group) {
+				sh = tmp;
+				break;
+			}
+		}
+
+		if (sh) {
+			conf->bypass_count -= conf->bypass_threshold;
+			if (conf->bypass_count < 0)
+				conf->bypass_count = 0;
+		}
+	}
+
+	if (!sh)
 		return NULL;
 
 	list_del_init(&sh->lru);
@@ -4844,13 +4911,13 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 }
 
 #define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, int group)
 {
 	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
 	int i, batch_size = 0;
 
 	while (batch_size < MAX_STRIPE_BATCH &&
-			(sh = __get_priority_stripe(conf)) != NULL)
+			(sh = __get_priority_stripe(conf, group)) != NULL)
 		batch[batch_size++] = sh;
 
 	if (batch_size == 0)
@@ -4868,6 +4935,38 @@ static int handle_active_stripes(struct r5conf *conf)
 	return batch_size;
 }
 
+static void raid5_do_work(struct work_struct *work)
+{
+	struct r5worker *worker = container_of(work, struct r5worker, work);
+	struct r5worker_group *group = worker->group;
+	struct r5conf *conf = group->conf;
+	int group_id = group - conf->worker_groups;
+	int handled;
+	struct blk_plug plug;
+
+	pr_debug("+++ raid5worker active\n");
+
+	blk_start_plug(&plug);
+	handled = 0;
+	spin_lock_irq(&conf->device_lock);
+	while (1) {
+		int batch_size, released;
+
+		released = release_stripe_list(conf);
+
+		batch_size = handle_active_stripes(conf, group_id);
+		if (!batch_size && !released)
+			break;
+		handled += batch_size;
+	}
+	pr_debug("%d stripes handled\n", handled);
+
+	spin_unlock_irq(&conf->device_lock);
+	blk_finish_plug(&plug);
+
+	pr_debug("--- raid5worker inactive\n");
+}
+
 /*
  * This is our raid5 kernel thread.
  *
@@ -4917,7 +5016,7 @@ static void raid5d(struct md_thread *thread)
 			handled++;
 		}
 
-		batch_size = handle_active_stripes(conf);
+		batch_size = handle_active_stripes(conf, ANY_GROUP);
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
@@ -5057,6 +5156,54 @@ static struct attribute_group raid5_attrs_group = {
 	.attrs = raid5_attrs,
 };
 
+static int alloc_thread_groups(struct r5conf *conf, int cnt)
+{
+	int i, j;
+	ssize_t size;
+	struct r5worker *workers;
+
+	conf->worker_cnt_per_group = cnt;
+	if (cnt == 0) {
+		conf->worker_groups = NULL;
+		return 0;
+	}
+	conf->group_cnt = num_possible_nodes();
+	size = sizeof(struct r5worker) * cnt;
+	workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+	conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+				conf->group_cnt, GFP_NOIO);
+	if (!conf->worker_groups || !workers) {
+		kfree(workers);
+		kfree(conf->worker_groups);
+		conf->worker_groups = NULL;
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < conf->group_cnt; i++) {
+		struct r5worker_group *group;
+
+		group = &conf->worker_groups[i];
+		INIT_LIST_HEAD(&group->handle_list);
+		group->conf = conf;
+		group->workers = workers + i * cnt;
+
+		for (j = 0; j < cnt; j++) {
+			group->workers[j].group = group;
+			INIT_WORK(&group->workers[j].work, raid5_do_work);
+		}
+	}
+
+	return 0;
+}
+
+static void free_thread_groups(struct r5conf *conf)
+{
+	if (conf->worker_groups)
+		kfree(conf->worker_groups[0].workers);
+	kfree(conf->worker_groups);
+	conf->worker_groups = NULL;
+}
+
 static sector_t
 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
@@ -5097,6 +5244,7 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
 	kfree(conf->disks);
@@ -5225,6 +5373,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
 	if (conf == NULL)
 		goto abort;
+	/* Don't enable multi-threading by default*/
+	if (alloc_thread_groups(conf, 0))
+		goto abort;
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -6530,6 +6681,10 @@ static struct md_personality raid4_personality =
 
 static int __init raid5_init(void)
 {
+	raid5_wq = alloc_workqueue("raid5wq",
+		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+	if (!raid5_wq)
+		return -ENOMEM;
 	register_md_personality(&raid6_personality);
 	register_md_personality(&raid5_personality);
 	register_md_personality(&raid4_personality);
@@ -6541,6 +6696,7 @@ static void raid5_exit(void)
 	unregister_md_personality(&raid6_personality);
 	unregister_md_personality(&raid5_personality);
 	unregister_md_personality(&raid4_personality);
+	destroy_workqueue(raid5_wq);
 }
 
 module_init(raid5_init);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a98f99d..1053663 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -212,6 +212,7 @@ struct stripe_head {
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
+	int			cpu;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -365,6 +366,17 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+struct r5worker {
+	struct work_struct work;
+	struct r5worker_group *group;
+};
+
+struct r5worker_group {
+	struct list_head handle_list;
+	struct r5conf *conf;
+	struct r5worker *workers;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	struct mddev		*mddev;
@@ -461,6 +473,9 @@ struct r5conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+	struct r5worker_group	*worker_groups;
+	int			group_cnt;
+	int			worker_cnt_per_group;
 };
 
 /*
-- 
cgit v0.10.2


From b721420e8719131896b009b11edbbd27d9b85e98 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Tue, 27 Aug 2013 17:50:42 +0800
Subject: raid5: sysfs entry to control worker thread number

Add a sysfs entry to control running workqueue thread number. If
group_thread_cnt is set to 0, we will disable workqueue offload handling of
stripes.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 32fa113..d79ecd9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5145,10 +5145,70 @@ stripe_cache_active_show(struct mddev *mddev, char *page)
 static struct md_sysfs_entry
 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 
+static ssize_t
+raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->worker_cnt_per_group);
+	else
+		return 0;
+}
+
+static int alloc_thread_groups(struct r5conf *conf, int cnt);
+static ssize_t
+raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	int err;
+	struct r5worker_group *old_groups;
+	int old_group_cnt;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+
+	if (new == conf->worker_cnt_per_group)
+		return len;
+
+	mddev_suspend(mddev);
+
+	old_groups = conf->worker_groups;
+	old_group_cnt = conf->worker_cnt_per_group;
+
+	conf->worker_groups = NULL;
+	err = alloc_thread_groups(conf, new);
+	if (err) {
+		conf->worker_groups = old_groups;
+		conf->worker_cnt_per_group = old_group_cnt;
+	} else {
+		if (old_groups)
+			kfree(old_groups[0].workers);
+		kfree(old_groups);
+	}
+
+	mddev_resume(mddev);
+
+	if (err)
+		return err;
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
+				raid5_show_group_thread_cnt,
+				raid5_store_group_thread_cnt);
+
 static struct attribute *raid5_attrs[] =  {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
+	&raid5_group_thread_cnt.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
-- 
cgit v0.10.2


From c46501b2deaa06efcaaf82917281941f02c6b307 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 27 Aug 2013 15:52:13 +1000
Subject: md/raid5: use seqcount to protect access to shape in make_request.

make_request() access various shape parameters (raid_disks, chunk_size
etc) which might be changed by raid5_start_reshape().

If the later is called at and awkward time during the form, the wrong
stripe_head might be used.

So introduce a 'seqcount' and after finding a stripe_head make sure
there is no reason to expect that we got the wrong one.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d79ecd9..4263df1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4408,8 +4408,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int previous;
+		int seq;
 
 	retry:
+		seq = read_seqcount_begin(&conf->gen_lock);
 		previous = 0;
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		if (unlikely(conf->reshape_progress != MaxSector)) {
@@ -4442,7 +4444,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 						  previous,
 						  &dd_idx, NULL);
 		pr_debug("raid456: make_request, sector %llu logical %llu\n",
-			(unsigned long long)new_sector, 
+			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
 
 		sh = get_active_stripe(conf, new_sector, previous,
@@ -4471,6 +4473,13 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 					goto retry;
 				}
 			}
+			if (read_seqcount_retry(&conf->gen_lock, seq)) {
+				/* Might have got the wrong stripe_head
+				 * by accident
+				 */
+				release_stripe(sh);
+				goto retry;
+			}
 
 			if (rw == WRITE &&
 			    logical_sector >= mddev->suspend_lo &&
@@ -5437,6 +5446,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	if (alloc_thread_groups(conf, 0))
 		goto abort;
 	spin_lock_init(&conf->device_lock);
+	seqcount_init(&conf->gen_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
@@ -6249,6 +6259,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 
 	atomic_set(&conf->reshape_stripes, 0);
 	spin_lock_irq(&conf->device_lock);
+	write_seqcount_begin(&conf->gen_lock);
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks += mddev->delta_disks;
 	conf->prev_chunk_sectors = conf->chunk_sectors;
@@ -6265,6 +6276,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 	else
 		conf->reshape_progress = 0;
 	conf->reshape_safe = conf->reshape_progress;
+	write_seqcount_end(&conf->gen_lock);
 	spin_unlock_irq(&conf->device_lock);
 
 	/* Add some new drives, as many as will fit.
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1053663..435b12d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -400,6 +400,7 @@ struct r5conf {
 	int			prev_chunk_sectors;
 	int			prev_algo;
 	short			generation; /* increments with every reshape */
+	seqcount_t		gen_lock;	/* lock against generation changes */
 	unsigned long		reshape_checkpoint; /* Time we last updated
 						     * metadata */
 	long long		min_offset_diff; /* minimum difference between
-- 
cgit v0.10.2


From 4d77e3ba88d085836f1e8e475e3131844dd89d04 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 27 Aug 2013 15:57:47 +1000
Subject: md/raid5: flush out all pending requests before proceeding with
 reshape.

Some requests - particularly 'discard' and 'read' are handled
differently depending on whether a reshape is active or not.

It is harmless to assume reshape is active if it isn't but wrong
to act as though reshape is not active when it is.

So when we start reshape - after making clear to all requests that
reshape has started - use mddev_suspend/mddev_resume to flush out all
requests.  This will ensure that no requests will be assuming the
absence of reshape once it really starts.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4263df1..663a8e5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6279,6 +6279,13 @@ static int raid5_start_reshape(struct mddev *mddev)
 	write_seqcount_end(&conf->gen_lock);
 	spin_unlock_irq(&conf->device_lock);
 
+	/* Now make sure any requests that proceeded on the assumption
+	 * the reshape wasn't running - like Discard or Read - have
+	 * completed.
+	 */
+	mddev_suspend(mddev);
+	mddev_resume(mddev);
+
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
 	 * Don't add devices if we are reducing the number of
-- 
cgit v0.10.2


From bfc90cb0936f5b972706625f38f72c7cb726c20a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Thu, 29 Aug 2013 15:40:32 +0800
Subject: raid5: only wakeup necessary threads

If there are not enough stripes to handle, we'd better not always
queue all available work_structs. If one worker can only handle small
or even none stripes, it will impact request merge and create lock
contention.

With this patch, the number of work_struct running will depend on
pending stripes number. Note: some statistics info used in the patch
are accessed without locking protection. This should doesn't matter,
we just try best to avoid queue unnecessary work_struct.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 663a8e5..7ff4f25 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -77,6 +77,7 @@ static struct workqueue_struct *raid5_wq;
 #define BYPASS_THRESHOLD	1
 #define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
+#define MAX_STRIPE_BATCH	8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
@@ -209,6 +210,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	struct r5worker_group *group;
+	int thread_cnt;
 	int i, cpu = sh->cpu;
 
 	if (!cpu_online(cpu)) {
@@ -220,6 +222,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 		struct r5worker_group *group;
 		group = conf->worker_groups + cpu_to_group(cpu);
 		list_add_tail(&sh->lru, &group->handle_list);
+		group->stripes_cnt++;
+		sh->group = group;
 	}
 
 	if (conf->worker_cnt_per_group == 0) {
@@ -229,8 +233,20 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 
 	group = conf->worker_groups + cpu_to_group(sh->cpu);
 
-	for (i = 0; i < conf->worker_cnt_per_group; i++)
-		queue_work_on(sh->cpu, raid5_wq, &group->workers[i].work);
+	group->workers[0].working = true;
+	/* at least one worker should run to avoid race */
+	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
+
+	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
+	/* wakeup more workers */
+	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
+		if (group->workers[i].working == false) {
+			group->workers[i].working = true;
+			queue_work_on(sh->cpu, raid5_wq,
+				      &group->workers[i].work);
+			thread_cnt--;
+		}
+	}
 }
 
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
@@ -589,6 +605,10 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 				    !test_bit(STRIPE_EXPANDING, &sh->state))
 					BUG();
 				list_del_init(&sh->lru);
+				if (sh->group) {
+					sh->group->stripes_cnt--;
+					sh->group = NULL;
+				}
 			}
 		}
 	} while (sh == NULL);
@@ -4153,15 +4173,18 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
 	struct stripe_head *sh = NULL, *tmp;
 	struct list_head *handle_list = NULL;
+	struct r5worker_group *wg = NULL;
 
 	if (conf->worker_cnt_per_group == 0) {
 		handle_list = &conf->handle_list;
 	} else if (group != ANY_GROUP) {
 		handle_list = &conf->worker_groups[group].handle_list;
+		wg = &conf->worker_groups[group];
 	} else {
 		int i;
 		for (i = 0; i < conf->group_cnt; i++) {
 			handle_list = &conf->worker_groups[i].handle_list;
+			wg = &conf->worker_groups[i];
 			if (!list_empty(handle_list))
 				break;
 		}
@@ -4208,11 +4231,16 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 			if (conf->bypass_count < 0)
 				conf->bypass_count = 0;
 		}
+		wg = NULL;
 	}
 
 	if (!sh)
 		return NULL;
 
+	if (wg) {
+		wg->stripes_cnt--;
+		sh->group = NULL;
+	}
 	list_del_init(&sh->lru);
 	atomic_inc(&sh->count);
 	BUG_ON(atomic_read(&sh->count) != 1);
@@ -4919,8 +4947,8 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	return handled;
 }
 
-#define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf, int group)
+static int handle_active_stripes(struct r5conf *conf, int group,
+				 struct r5worker *worker)
 {
 	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
 	int i, batch_size = 0;
@@ -4963,7 +4991,8 @@ static void raid5_do_work(struct work_struct *work)
 
 		released = release_stripe_list(conf);
 
-		batch_size = handle_active_stripes(conf, group_id);
+		batch_size = handle_active_stripes(conf, group_id, worker);
+		worker->working = false;
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
@@ -5025,7 +5054,7 @@ static void raid5d(struct md_thread *thread)
 			handled++;
 		}
 
-		batch_size = handle_active_stripes(conf, ANY_GROUP);
+		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 435b12d..2113ffa 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -213,6 +213,7 @@ struct stripe_head {
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
 	int			cpu;
+	struct r5worker_group	*group;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -369,12 +370,14 @@ struct disk_info {
 struct r5worker {
 	struct work_struct work;
 	struct r5worker_group *group;
+	bool working;
 };
 
 struct r5worker_group {
 	struct list_head handle_list;
 	struct r5conf *conf;
 	struct r5worker *workers;
+	int stripes_cnt;
 };
 
 struct r5conf {
-- 
cgit v0.10.2