From ca9558a33f658155c3b69f92897c2e6a848684f5 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Fri, 31 Oct 2014 14:55:05 +0000
Subject: rcu: Remove redundant rcu_is_cpu_rrupt_from_idle() from tiny RCU

Let's start by assuming that something in the idle loop posts a callback,
and that a scheduling-clock interrupt occurs:

1.  The system is idle and stays that way, no runnable tasks.

2.  Scheduling-clock interrupt occurs, rcu_check_callbacks() is called
    as a result, which in turn calls rcu_is_cpu_rrupt_from_idle().

3.  rcu_is_cpu_rrupt_from_idle() reports the CPU was interrupted from
    idle, which results in a rcu_sched_qs() call, which does a
    raise_softirq(RCU_SOFTIRQ).

4.  Upon return from interrupt, rcu_irq_exit() is invoked, which calls
    rcu_idle_enter_common(), which in turn calls rcu_sched_qs() again,
    which does another raise_softirq(RCU_SOFTIRQ).

5.  The softirq runs shortly thereafter and invokes
    rcu_process_callbacks(), which invokes __rcu_process_callbacks().

6.  So now callbacks can be invoked.  At least they can be if ->donetail
    has been updated.  Which it will have been, because rcu_sched_qs()
    invokes rcu_qsctr_help().

In the described scenario, rcu_sched_qs() and raise_softirq(RCU_SOFTIRQ)
get called twice in steps 3 and 4.  This redundancy can be eliminated by
removing the rcu_is_cpu_rrupt_from_idle() function.

Signed-off-by: Alexander Gordeev
Cc: Paul E. McKenney
Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649..805b6d5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -186,16 +186,6 @@ EXPORT_SYMBOL(__rcu_is_watching);
 #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */

 /*
- * Test whether the current CPU was interrupted from idle.  Nested
- * interrupts don't count, we must be running at the first interrupt
- * level.
- */
-static int rcu_is_cpu_rrupt_from_idle(void)
-{
-	return rcu_dynticks_nesting <= 1;
-}
-
-/*
  * Helper function for rcu_sched_qs() and rcu_bh_qs().
  * Also irqs are disabled to avoid confusion due to interrupt handlers
  * invoking call_rcu().
@@ -250,7 +240,7 @@ void rcu_bh_qs(void)
 void rcu_check_callbacks(int user)
 {
 	RCU_TRACE(check_cpu_stalls());
-	if (user || rcu_is_cpu_rrupt_from_idle())
+	if (user)
 		rcu_sched_qs();
 	else if (!in_softirq())
 		rcu_bh_qs();
--
cgit v0.10.2

From 924df8a0117aa2725750f1ec4d282867534d9a89 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 29 Oct 2014 15:37:58 -0700
Subject: rcu: Fix invoke_rcu_callbacks() comment

Despite what the comment says, it is only softirqs that are disabled,
not interrupts.  This commit therefore fixes the comment.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4c106fc..9c25b99 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2601,7 +2601,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  * Schedule RCU callback invocation.  If the specified type of RCU
  * does not support RCU priority boosting, just do a direct call,
  * otherwise wake up the per-CPU kernel kthread.  Note that because we
- * are running on the current CPU with interrupts disabled, the
+ * are running on the current CPU with softirqs disabled, the
  * rcu_cpu_kthread_task cannot disappear out from under us.
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
--
cgit v0.10.2

From 536fa402221f09633e7c5801b327055ab716a363 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 5 Sep 2014 11:14:48 -0700
Subject: compiler: Allow 1- and 2-byte smp_load_acquire() and smp_store_release()

CPUs without single-byte and double-byte loads and stores place some
"interesting" requirements on concurrent code.  For example (adapted
from Peter Hurley's test code), suppose we have the following structure:

	struct foo {
		spinlock_t lock1;
		spinlock_t lock2;
		char a; /* Protected by lock1. */
		char b; /* Protected by lock2. */
	};
	struct foo *foop;

Of course, it is common (and good) practice to place data protected
by different locks in separate cache lines.  However, if the locks are
rarely acquired (for example, only in rare error cases), and there are
a great many instances of the data structure, then memory footprint can
trump false-sharing concerns, so that it can be better to place them in
the same cache line as above.

But if the CPU does not support single-byte loads and stores, a store
to foop->a will do a non-atomic read-modify-write operation on foop->b,
which will come as a nasty surprise to someone holding foop->lock2.
So we now require CPUs to support single-byte and double-byte loads and
stores.  Therefore, this commit adjusts the definition of __native_word()
to allow these sizes to be used by smp_load_acquire() and
smp_store_release().

Signed-off-by: Paul E. McKenney
Cc: Peter Zijlstra

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index a1c81f8..49811cd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -385,7 +385,7 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int

 /* Is this type a native word size -- useful for atomic operations */
 #ifndef __native_word
-# define __native_word(t) (sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
+# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
 #endif

 /* Compile time object size, -1 for unknown */
--
cgit v0.10.2
From ac59853c06993a442e8060bc19040b2ca3025aec Mon Sep 17 00:00:00 2001
From: Pranith Kumar
Date: Thu, 13 Nov 2014 14:24:14 -0500
Subject: rcupdate: Replace smp_read_barrier_depends() with lockless_dereference()

Recently lockless_dereference() was added, which can be used in place
of hard-coding smp_read_barrier_depends().  This patch makes that change.

Signed-off-by: Pranith Kumar
Signed-off-by: Paul E. McKenney

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ed4f593..386ba28 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -582,11 +582,11 @@ static inline void rcu_preempt_sleep_check(void)
 })
 #define __rcu_dereference_check(p, c, space) \
 ({ \
-	typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \
+	/* Dependency order vs. p above. */ \
+	typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \
 	rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \
 	rcu_dereference_sparse(p, space); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
-	((typeof(*p) __force __kernel *)(_________p1)); \
+	((typeof(*p) __force __kernel *)(________p1)); \
 })
 #define __rcu_dereference_protected(p, c, space) \
 ({ \
@@ -603,10 +603,10 @@ static inline void rcu_preempt_sleep_check(void)
 })
 #define __rcu_dereference_index_check(p, c) \
 ({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	/* Dependency order vs. p above. */ \
+	typeof(p) _________p1 = lockless_dereference(p); \
 	rcu_lockdep_assert(c, \
 			   "suspicious rcu_dereference_index_check() usage"); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
 	(_________p1); \
 })
--
cgit v0.10.2
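Aside: a minimal sketch of lockless_dereference() in use (hypothetical
names, not from the patch); it combines the load of a pointer with the
dependency-ordering barrier that callers previously had to supply by hand
via smp_read_barrier_depends():

	#include <linux/compiler.h>

	struct config {
		int threshold;
	};

	struct config *cur_config;	/* Published by some writer. */

	int read_threshold(void)
	{
		struct config *c = lockless_dereference(cur_config);

		/* The dependent access is ordered after the pointer load. */
		return c ? c->threshold : 0;
	}
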
From 5f6130fa52ee0df0b1da4518c5bbef42bcfe7d83 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 9 Dec 2014 17:53:34 +0800
Subject: tiny_rcu: Directly force QS when call_rcu_[bh|sched]() on idle_task

For RCU in UP, context-switch = QS = GP, thus we can force a
context-switch when call_rcu_[bh|sched]() is invoked from the idle task.

After doing so, rcu_idle/irq_enter/exit() are useless, so we can simply
make these functions empty.  More importantly, this change does not
change the functionality logically.

Note: rcu_sched_qs()/raise_softirq(RCU_SOFTIRQ) in rcu_idle_enter() and
the outermost rcu_irq_exit() will have to wake up the ksoftirqd
(due to in_interrupt() == 0).

Before this patch:

	call_rcu_sched() in idle;
	do other stuffs;
	outmost rcu_irq_exit() (or rcu_idle_enter())
	  rcu_sched_qs()
	    QS, and GP, and advance cb
	    wake up the ksoftirqd
	      set resched
	resched to ksoftirqd (or other)

After this patch:

	call_rcu_sched() in idle
	  set resched
	do other stuffs
	outmost rcu_irq_exit() (or rcu_idle_enter(), now both empty functions)
	start to resched (see above)
	rcu_sched_qs()
	  QS, and GP, and advance cb
	  wake up the ksoftirqd
	resched to ksoftirqd (or other)

These two code paths are almost the same.

Size changes after patching:

	size kernel/rcu/tiny-old.o kernel/rcu/tiny-patched.o
	   text    data     bss     dec     hex	filename
	   3449     206       8    3663     e4f	kernel/rcu/tiny-old.o
	   2406     144       8    2558     9fe	kernel/rcu/tiny-patched.o

Signed-off-by: Lai Jiangshan
Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02e..80adef7 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);

 void rcu_early_boot_tests(void);

+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
 #endif /* __LINUX_RCU_H */

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 805b6d5..85287ec 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
 		       void (*func)(struct rcu_head *rcu),
 		       struct rcu_ctrlblk *rcp);

-static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-
 #include "tiny_plugin.h"

-/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
-static void rcu_idle_enter_common(long long newval)
-{
-	if (newval) {
-		RCU_TRACE(trace_rcu_dyntick(TPS("--="),
-					    rcu_dynticks_nesting, newval));
-		rcu_dynticks_nesting = newval;
-		return;
-	}
-	RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
-				    rcu_dynticks_nesting, newval));
-	if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
-		struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
-		RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
-					    rcu_dynticks_nesting, newval));
-		ftrace_dump(DUMP_ALL);
-		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
-			  current->pid, current->comm,
-			  idle->pid, idle->comm); /* must be idle task! */
-	}
-	rcu_sched_qs(); /* implies rcu_bh_inc() */
-	barrier();
-	rcu_dynticks_nesting = newval;
-}
-
 /*
  * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode (i.e., if the new value of dynticks_nesting is zero).
+ * entered that mode.
  */
 void rcu_idle_enter(void)
 {
-	unsigned long flags;
-	long long newval;
-
-	local_irq_save(flags);
-	WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
-	if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
-	    DYNTICK_TASK_NEST_VALUE)
-		newval = 0;
-	else
-		newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
-	rcu_idle_enter_common(newval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);

@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
  */
 void rcu_irq_exit(void)
 {
-	unsigned long flags;
-	long long newval;
-
-	local_irq_save(flags);
-	newval = rcu_dynticks_nesting - 1;
-	WARN_ON_ONCE(newval < 0);
-	rcu_idle_enter_common(newval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_irq_exit);

-/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
-static void rcu_idle_exit_common(long long oldval)
-{
-	if (oldval) {
-		RCU_TRACE(trace_rcu_dyntick(TPS("++="),
-					    oldval, rcu_dynticks_nesting));
-		return;
-	}
-	RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
-	if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
-		struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
-		RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
-					    oldval, rcu_dynticks_nesting));
-		ftrace_dump(DUMP_ALL);
-		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
-			  current->pid, current->comm,
-			  idle->pid, idle->comm); /* must be idle task! */
-	}
-}
-
 /*
  * Exit idle, so that we are no longer in an extended quiescent state.
  */
 void rcu_idle_exit(void)
 {
-	unsigned long flags;
-	long long oldval;
-
-	local_irq_save(flags);
-	oldval = rcu_dynticks_nesting;
-	WARN_ON_ONCE(rcu_dynticks_nesting < 0);
-	if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
-		rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
-	else
-		rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-	rcu_idle_exit_common(oldval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);

@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
  */
 void rcu_irq_enter(void)
 {
-	unsigned long flags;
-	long long oldval;
-
-	local_irq_save(flags);
-	oldval = rcu_dynticks_nesting;
-	rcu_dynticks_nesting++;
-	WARN_ON_ONCE(rcu_dynticks_nesting == 0);
-	rcu_idle_exit_common(oldval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_irq_enter);

@@ -179,7 +89,7 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
  */
 bool notrace __rcu_is_watching(void)
 {
-	return rcu_dynticks_nesting;
+	return true;
 }
 EXPORT_SYMBOL(__rcu_is_watching);

@@ -347,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
 	rcp->curtail = &head->next;
 	RCU_TRACE(rcp->qlen++);
 	local_irq_restore(flags);
+
+	if (unlikely(is_idle_task(current))) {
+		/* force scheduling for rcu_sched_qs() */
+		resched_cpu(0);
+	}
 }

 /*

diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c565..b5fc5e5 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -147,7 +147,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 	js = ACCESS_ONCE(rcp->jiffies_stall);
 	if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
 		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
-		       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+		       rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
		       jiffies - rcp->gp_start, rcp->qlen);
		dump_stack();
	}

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9c25b99..203b50d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -935,12 +935,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 }

 /*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
-/*
  * Return true if the specified CPU has passed through a quiescent
  * state by virtue of being in or having passed through a dynticks
  * idle state since the last call to dyntick_save_progress_counter()
--
cgit v0.10.2

From f520c98e3e5212d8c282a86d9b7697dd70326192 Mon Sep 17 00:00:00 2001
From: Ying Xue
Date: Fri, 12 Dec 2014 09:36:14 +0800
Subject: rculist: Fix sparse warning

This fixes the following sparse warnings:

	make C=1 CF=-D__CHECK_ENDIAN__ net/ipv6/addrconf.o
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)

To silence these sparse complaints, an RCU annotation should be added to
the "next" pointer of the hlist_node structure through the hlist_next_rcu()
macro when iterating over an hlist with
hlist_for_each_entry_continue_rcu_bh().

By the way, this commit also resolves the same error appearing in
hlist_for_each_entry_continue_rcu().

Signed-off-by: Ying Xue
Signed-off-by: Paul E. McKenney

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 529bc94..a18b16f 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -524,11 +524,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  * @member:	the name of the hlist_node within the struct.
  */
 #define hlist_for_each_entry_continue_rcu(pos, member)			\
-	for (pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
-			typeof(*(pos)), member);			\
+	for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+			&(pos)->member)), typeof(*(pos)), member);	\
	     pos;							\
-	     pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
-			typeof(*(pos)), member))
+	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+			&(pos)->member)), typeof(*(pos)), member))

 /**
  * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
@@ -536,11 +536,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  * @member:	the name of the hlist_node within the struct.
  */
 #define hlist_for_each_entry_continue_rcu_bh(pos, member)		\
-	for (pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\
-			typeof(*(pos)), member);			\
+	for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
+			&(pos)->member)), typeof(*(pos)), member);	\
	     pos;							\
-	     pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\
-			typeof(*(pos)), member))
+	     pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
+			&(pos)->member)), typeof(*(pos)), member))

 /**
  * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
--
cgit v0.10.2
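Aside: a minimal sketch of the iterator fixed above (hypothetical types,
not from the patch); the walk continues from an entry already in hand,
and any data of interest is copied out before dropping the read-side
critical section:

	#include <linux/rculist.h>

	struct item {
		int key;
		struct hlist_node node;
	};

	int find_next_positive_key(struct item *pos)
	{
		int key = -1;

		rcu_read_lock();
		hlist_for_each_entry_continue_rcu(pos, node) {
			if (pos->key > 0) {
				key = pos->key;	/* Copy out while still protected. */
				break;
			}
		}
		rcu_read_unlock();
		return key;
	}
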
From 87af9e7ff9d909e70a006ca0974466e2a1d8db0a Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 12 Dec 2014 10:11:44 +0100
Subject: hotplugcpu: Avoid deadlocks by waking active_writer

Commit b2c4623dcd07 ("rcu: More on deadlock between CPU hotplug and
expedited grace periods") introduced another problem that can easily
be reproduced by starting/stopping cpus in a loop.

E.g.:

	for i in `seq 5000`; do
		echo 1 > /sys/devices/system/cpu/cpu1/online
		echo 0 > /sys/devices/system/cpu/cpu1/online
	done

Will result in:

	INFO: task /cpu_start_stop:1 blocked for more than 120 seconds.
	Call Trace:
	([<00000000006a028e>] __schedule+0x406/0x91c)
	 [<0000000000130f60>] cpu_hotplug_begin+0xd0/0xd4
	 [<0000000000130ff6>] _cpu_up+0x3e/0x1c4
	 [<0000000000131232>] cpu_up+0xb6/0xd4
	 [<00000000004a5720>] device_online+0x80/0xc0
	 [<00000000004a57f0>] online_store+0x90/0xb0
	...

And a deadlock.

The problem is that if the last ref in put_online_cpus() can't get the
cpu_hotplug.lock, the puts_pending count is incremented, but a sleeping
active_writer might never be woken up, therefore never exiting the loop
in cpu_hotplug_begin().

This fix removes puts_pending and turns refcount into an atomic variable.
We also introduce a wait queue for the active_writer, to avoid possible
races and use-after-free.  There is no need to take the lock in
put_online_cpus() anymore.

Can't reproduce it with this fix.

Signed-off-by: David Hildenbrand
Signed-off-by: Paul E. McKenney

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d22023..1972b16 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;

 static struct {
 	struct task_struct *active_writer;
-	struct mutex lock; /* Synchronizes accesses to refcount, */
+	/* wait queue to wake up the active_writer */
+	wait_queue_head_t wq;
+	/* verifies that no writer will get active while readers are active */
+	struct mutex lock;
 	/*
 	 * Also blocks the new readers during
 	 * an ongoing cpu hotplug operation.
 	 */
-	int refcount;
-	/* And allows lockless put_online_cpus(). */
-	atomic_t puts_pending;
+	atomic_t refcount;

 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif
 } cpu_hotplug = {
 	.active_writer = NULL,
+	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-	.refcount = 0,
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	.dep_map = {.name = "cpu_hotplug.lock" },
 #endif
@@ -86,15 +87,6 @@ static struct {
 #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
 #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)

-static void apply_puts_pending(int max)
-{
-	int delta;
-
-	if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
-		delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
-		cpu_hotplug.refcount -= delta;
-	}
-}

 void get_online_cpus(void)
 {
@@ -103,8 +95,7 @@ void get_online_cpus(void)
 		return;
 	cpuhp_lock_acquire_read();
 	mutex_lock(&cpu_hotplug.lock);
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
 	if (!mutex_trylock(&cpu_hotplug.lock))
 		return false;
 	cpuhp_lock_acquire_tryread();
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 	return true;
 }
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);

 void put_online_cpus(void)
 {
+	int refcount;
+
 	if (cpu_hotplug.active_writer == current)
 		return;
-	if (!mutex_trylock(&cpu_hotplug.lock)) {
-		atomic_inc(&cpu_hotplug.puts_pending);
-		cpuhp_lock_release();
-		return;
-	}

-	if (WARN_ON(!cpu_hotplug.refcount))
-		cpu_hotplug.refcount++; /* try to fix things up */
+	refcount = atomic_dec_return(&cpu_hotplug.refcount);
+	if (WARN_ON(refcount < 0)) /* try to fix things up */
+		atomic_inc(&cpu_hotplug.refcount);
+
+	if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+		wake_up(&cpu_hotplug.wq);

-	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
-		wake_up_process(cpu_hotplug.active_writer);
-	mutex_unlock(&cpu_hotplug.lock);
 	cpuhp_lock_release();

 }
@@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
  */
 void cpu_hotplug_begin(void)
 {
-	cpu_hotplug.active_writer = current;
+	DEFINE_WAIT(wait);

+	cpu_hotplug.active_writer = current;
 	cpuhp_lock_acquire();
+
 	for (;;) {
 		mutex_lock(&cpu_hotplug.lock);
-		apply_puts_pending(1);
-		if (likely(!cpu_hotplug.refcount))
-			break;
-		__set_current_state(TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (likely(!atomic_read(&cpu_hotplug.refcount)))
+			break;
 		mutex_unlock(&cpu_hotplug.lock);
 		schedule();
 	}
+	finish_wait(&cpu_hotplug.wq, &wait);
 }

 void cpu_hotplug_done(void)
--
cgit v0.10.2
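Aside: the deadlock fix above is an instance of the standard wait-queue
idiom.  A generic sketch (hypothetical names, not from the patch) — the
waiter re-checks its condition after prepare_to_wait(), so a concurrent
wake_up() can never be lost:

	#include <linux/wait.h>
	#include <linux/atomic.h>
	#include <linux/sched.h>

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
	static atomic_t my_count = ATOMIC_INIT(0);

	void wait_for_zero(void)
	{
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&my_count))	/* Re-check after queueing. */
				break;
			schedule();
		}
		finish_wait(&my_wq, &wait);
	}

	void put_ref(void)
	{
		if (atomic_dec_return(&my_count) <= 0 && waitqueue_active(&my_wq))
			wake_up(&my_wq);
	}
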
From 41050a009640f9f330a5b916563ca7faf853a98c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 18 Dec 2014 12:31:27 -0800
Subject: rcu: Fix rcu_barrier() race that could result in too-short wait

The rcu_barrier() no-callbacks check for no-CBs CPUs has race conditions.
It checks a given CPU's lists of callbacks, and if all three no-CBs lists
are empty, ignores that CPU.  However, these three lists could potentially
be empty even when callbacks are present if the check executed just as
the callbacks were being moved from one list to another.

It turns out that recent versions of rcutorture can spot this race.

This commit plugs this hole by consolidating the per-list counts of
no-CBs callbacks into a single count, which is incremented before the
corresponding callback is posted and decremented after it is invoked.
Then rcu_barrier() checks this single count to reliably determine whether
the corresponding CPU has no-CBs callbacks.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 203b50d..5b9f3b9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3344,6 +3344,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 		} else {
 			_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
 					   rsp->n_barrier_done);
+			smp_mb__before_atomic();
 			atomic_inc(&rsp->barrier_cpu_count);
 			__call_rcu(&rdp->barrier_head,
 				   rcu_barrier_callback, rsp, cpu, 0);

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b184..cb59086 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -340,14 +340,10 @@ struct rcu_data {
 #ifdef CONFIG_RCU_NOCB_CPU
 	struct rcu_head *nocb_head;	/* CBs waiting for kthread. */
 	struct rcu_head **nocb_tail;
-	atomic_long_t nocb_q_count;	/* # CBs waiting for kthread */
-	atomic_long_t nocb_q_count_lazy; /*  (approximate). */
+	atomic_long_t nocb_q_count;	/* # CBs waiting for nocb */
+	atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
 	struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
 	struct rcu_head **nocb_follower_tail;
-	atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
-	atomic_long_t nocb_follower_count_lazy; /*  (approximate). */
-	int nocb_p_count;		/* # CBs being invoked by kthread */
-	int nocb_p_count_lazy;		/*  (approximate). */
 	wait_queue_head_t nocb_wq;	/* For nocb kthreads to sleep on. */
 	struct task_struct *nocb_kthread;
 	int nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */
@@ -356,8 +352,6 @@ struct rcu_data {
 	struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
 					/* CBs waiting for GP. */
 	struct rcu_head **nocb_gp_tail;
-	long nocb_gp_count;
-	long nocb_gp_count_lazy;
 	bool nocb_leader_sleep;		/* Is the nocb leader thread asleep? */
 	struct rcu_data *nocb_next_follower;
 					/* Next follower in wakeup chain. */
@@ -622,24 +616,15 @@ static void rcu_dynticks_task_exit(void);
 #endif /* #ifndef RCU_TREE_NONCORE */

 #ifdef CONFIG_RCU_TRACE
-#ifdef CONFIG_RCU_NOCB_CPU
-/* Sum up queue lengths for tracing. */
+/* Read out queue lengths for tracing. */
 static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 {
-	*ql = atomic_long_read(&rdp->nocb_q_count) +
-	      rdp->nocb_p_count +
-	      atomic_long_read(&rdp->nocb_follower_count) +
-	      rdp->nocb_p_count + rdp->nocb_gp_count;
-	*qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
-	       rdp->nocb_p_count_lazy +
-	       atomic_long_read(&rdp->nocb_follower_count_lazy) +
-	       rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
-}
+#ifdef CONFIG_RCU_NOCB_CPU
+	*ql = atomic_long_read(&rdp->nocb_q_count);
+	*qll = atomic_long_read(&rdp->nocb_q_count_lazy);
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
 	*ql = 0;
 	*qll = 0;
-}
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+}
 #endif /* #ifdef CONFIG_RCU_TRACE */

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb..e5c43b7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2056,9 +2056,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 {
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	unsigned long ret;
+#ifdef CONFIG_PROVE_RCU
 	struct rcu_head *rhp;
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+	/*
+	 * Check count of all no-CBs callbacks awaiting invocation.
+	 * There needs to be a barrier before this function is called,
+	 * but associated with a prior determination that no more
+	 * callbacks would be posted.  In the worst case, the first
+	 * barrier in _rcu_barrier() suffices (but the caller cannot
+	 * necessarily rely on this, not a substitute for the caller
+	 * getting the concurrency design right!).  There must also be
+	 * a barrier between the following load and posting of a callback
+	 * (if a callback is in fact needed).  This is associated with an
+	 * atomic_inc() in the caller.
+	 */
+	ret = atomic_long_read(&rdp->nocb_q_count);

-	/* No-CBs CPUs might have callbacks on any of three lists. */
+#ifdef CONFIG_PROVE_RCU
 	rhp = ACCESS_ONCE(rdp->nocb_head);
 	if (!rhp)
 		rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +2089,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 			       cpu, rhp->func);
 		WARN_ON_ONCE(1);
 	}
+#endif /* #ifdef CONFIG_PROVE_RCU */

-	return !!rhp;
+	return !!ret;
 }

 /*
@@ -2095,9 +2113,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 	struct task_struct *t;

 	/* Enqueue the callback on the nocb list and update counts. */
+	atomic_long_add(rhcount, &rdp->nocb_q_count);
+	/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
 	old_rhpp = xchg(&rdp->nocb_tail, rhtp);
 	ACCESS_ONCE(*old_rhpp) = rhp;
-	atomic_long_add(rhcount, &rdp->nocb_q_count);
 	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
 	smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */

@@ -2288,9 +2307,6 @@ wait_again:
 		/* Move callbacks to wait-for-GP list, which is empty. */
 		ACCESS_ONCE(rdp->nocb_head) = NULL;
 		rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
-		rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
-		rdp->nocb_gp_count_lazy =
-			atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
 		gotcbs = true;
 	}

@@ -2338,9 +2354,6 @@ wait_again:
 		/* Append callbacks to follower's "done" list. */
 		tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
 		*tail = rdp->nocb_gp_head;
-		atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
-		atomic_long_add(rdp->nocb_gp_count_lazy,
-				&rdp->nocb_follower_count_lazy);
 		smp_mb__after_atomic(); /* Store *tail before wakeup. */
 		if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
 			/*
@@ -2415,13 +2428,11 @@ static int rcu_nocb_kthread(void *arg)
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
 		ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
 		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
-		c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
-		cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
-		rdp->nocb_p_count += c;
-		rdp->nocb_p_count_lazy += cl;

 		/* Each pass through the following loop invokes a callback. */
-		trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
+		trace_rcu_batch_start(rdp->rsp->name,
+				      atomic_long_read(&rdp->nocb_q_count_lazy),
+				      atomic_long_read(&rdp->nocb_q_count), -1);
 		c = cl = 0;
 		while (list) {
 			next = list->next;
@@ -2443,9 +2454,9 @@ static int rcu_nocb_kthread(void *arg)
 			list = next;
 		}
 		trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
-		ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
-		ACCESS_ONCE(rdp->nocb_p_count_lazy) =
-						rdp->nocb_p_count_lazy - cl;
+		smp_mb__before_atomic();  /* _add after CB invocation. */
+		atomic_long_add(-c, &rdp->nocb_q_count);
+		atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
 		rdp->n_nocbs_invoked += c;
 	}
 	return 0;
--
cgit v0.10.2

From 5a43b88e98eaca88fa12abcacd0000adc879dd2c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Wed, 24 Dec 2014 17:55:28 +0800
Subject: rcu: Remove "select IRQ_WORK" from config TREE_RCU

Commit 48a7639ce80c ("rcu: Make callers awaken grace-period kthread")
removed the irq_work_queue() call, so TREE_RCU no longer needs irq work.
This commit therefore updates RCU's Kconfig and #includes accordingly.

Signed-off-by: Lai Jiangshan
Signed-off-by: Paul E. McKenney

diff --git a/init/Kconfig b/init/Kconfig
index 9afb971..39b4313 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -470,7 +470,6 @@ choice
 config TREE_RCU
 	bool "Tree-based hierarchical RCU"
 	depends on !PREEMPT && SMP
-	select IRQ_WORK
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP system with hundreds or
@@ -480,7 +479,6 @@ config TREE_RCU
 config PREEMPT_RCU
 	bool "Preemptible tree-based hierarchical RCU"
 	depends on PREEMPT
-	select IRQ_WORK
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index cb59086..41860b9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
-#include <linux/irq_work.h>

 /*
  * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
--
cgit v0.10.2
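Aside on commit 41050a00 above: the rcu_barrier() fix relies on a simple
counting discipline.  An abstract sketch (hypothetical helpers publish()
and process(), not from the patch) — increment before publishing an item,
decrement only after it has been fully processed, so a zero count reliably
means "nothing queued and nothing still in flight":

	#include <linux/atomic.h>

	struct item;
	extern void publish(struct item *ip);	/* Hypothetical. */
	extern void process(struct item *ip);	/* Hypothetical. */

	static atomic_long_t q_count = ATOMIC_LONG_INIT(0);

	void enqueue_item(struct item *ip)
	{
		atomic_long_inc(&q_count);	/* Count first... */
		publish(ip);			/* ...then make it visible. */
	}

	void invoke_item(struct item *ip)
	{
		process(ip);
		smp_mb__before_atomic();	/* Order processing before decrement. */
		atomic_long_dec(&q_count);
	}

	bool anything_pending(void)
	{
		return !!atomic_long_read(&q_count);
	}
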
McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb..d59913e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1127,7 +1127,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v0.10.2 From 74e871ac6cb17f67cbefb569d98a8d05de666e07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Oct 2014 21:08:53 -0700 Subject: rcu: Rename "empty" to "empty_norm" in preparation for boost rework This commit undertakes a simple variable renaming to make way for some rework of RCU priority boosting. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d59913e..3d22f0b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -313,8 +313,8 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, */ void rcu_read_unlock_special(struct task_struct *t) { - int empty; int empty_exp; + int empty_norm; int empty_exp_now; unsigned long flags; struct list_head *np; @@ -367,7 +367,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -393,7 +393,7 @@ void rcu_read_unlock_special(struct task_struct *t) * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, -- cgit v0.10.2 From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 11:22:37 -0700 Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() in preparation for the rework of RCU priority boosting. This new function will be invoked from rcu_read_unlock_special() in the reworked scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf rcu_node structure's ->qsmaskinit field has already been updated. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fc..75c6b33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2227,6 +2227,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) } /* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. 
From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 11:22:37 -0700
Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu()

This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu()
in preparation for the rework of RCU priority boosting.  This new
function will be invoked from rcu_read_unlock_special() in the reworked
scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf
rcu_node structure's ->qsmaskinit field has already been updated.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4c106fc..75c6b33 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2227,6 +2227,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 }

 /*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section.  Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely.  That said, invoking it after the fact will cost you
+ * a needless lock acquisition.  So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+	long mask;
+	struct rcu_node *rnp = rnp_leaf;
+
+	if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+		return;
+	for (;;) {
+		mask = rnp->grpmask;
+		rnp = rnp->parent;
+		if (!rnp)
+			break;
+		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+		smp_mb__after_unlock_lock(); /* GP memory ordering. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit) {
+			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			return;
+		}
+		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+	}
+}
+
+/*
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
  * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2236,7 +2276,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	unsigned long mask;
 	int need_report = 0;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -2252,24 +2291,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
 	rcu_adopt_orphan_cbs(rsp, flags);

-	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
-	mask = rdp->grpmask;	/* rnp->grplo is constant. */
-	do {
-		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
-		smp_mb__after_unlock_lock();
-		rnp->qsmaskinit &= ~mask;
-		if (rnp->qsmaskinit != 0) {
-			if (rnp != rdp->mynode)
-				raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-			break;
-		}
-		if (rnp == rdp->mynode)
-			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-		else
-			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-		mask = rnp->grpmask;
-		rnp = rnp->parent;
-	} while (rnp != NULL);
+	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+	raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
+	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
+	rnp->qsmaskinit &= ~rdp->grpmask;
+	if (rnp->qsmaskinit == 0) {
+		need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+		rcu_cleanup_dead_rnp(rnp);
+	}

 	/*
	 * We still hold the leaf rcu_node structure lock here, and

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b184..9315477 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -552,6 +552,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
 				      unsigned long flags);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3d22f0b..d044b9c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -307,6 +307,15 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
 }

 /*
+ * Return true if the specified rcu_node structure has tasks that were
+ * preempted within an RCU read-side critical section.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+	return !list_empty(&rnp->blkd_tasks);
+}
+
+/*
  * Handle special cases during rcu_read_unlock(), such as needing to
  * notify RCU core processing or task having blocked during the RCU
  * read-side critical section.
@@ -967,6 +976,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }

+/*
+ * Because there is no preemptible RCU, there can be no readers blocked.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+	return false;
+}
+
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */

 /*
--
cgit v0.10.2
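Aside on rcu_cleanup_dead_rnp() above: it is an instance of a generic
upward-propagation pattern over a tree of bitmasks.  A standalone sketch
(hypothetical node type, not from the patch) — clear this node's bit in
its parent and keep climbing only while a parent becomes completely empty:

	struct node {
		unsigned long mask;		/* This node's bit in parent->child_bits. */
		unsigned long child_bits;	/* One bit per active child. */
		struct node *parent;		/* NULL at the root. */
	};

	void propagate_empty(struct node *n)
	{
		struct node *p;

		for (p = n->parent; p; n = p, p = p->parent) {
			p->child_bits &= ~n->mask;
			if (p->child_bits)
				return;	/* A sibling subtree is still active. */
		}
	}
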
From b6a932d1d9840727eee619d455bdeeedaa205be9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 12:05:04 -0700
Subject: rcu: Make rcu_read_unlock_special() propagate ->qsmaskinit bit clearing

This commit causes rcu_read_unlock_special() to propagate ->qsmaskinit
bit clearing up the rcu_node tree once a given rcu_node structure's
blkd_tasks list becomes empty.  This is the final commit in preparation
for the rework of RCU priority boosting: It enables preempted tasks to
remain queued on their rcu_node structure even after all of that
rcu_node structure's CPUs have gone offline.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 75c6b33..6625a1b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2329,6 +2329,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
 }

+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
+
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 }

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d044b9c..8a2b841 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -322,9 +322,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  */
 void rcu_read_unlock_special(struct task_struct *t)
 {
-	int empty_exp;
-	int empty_norm;
-	int empty_exp_now;
+	bool empty;
+	bool empty_exp;
+	bool empty_norm;
+	bool empty_exp_now;
 	unsigned long flags;
 	struct list_head *np;
 #ifdef CONFIG_RCU_BOOST
@@ -376,6 +377,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 				break;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		}
+		empty = !rcu_preempt_has_tasks(rnp);
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		empty_exp = !rcu_preempted_readers_exp(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
@@ -396,6 +398,14 @@ void rcu_read_unlock_special(struct task_struct *t)
 #endif /* #ifdef CONFIG_RCU_BOOST */

 		/*
+		 * If this was the last task on the list, go see if we
+		 * need to propagate ->qsmaskinit bit clearing up the
+		 * rcu_node tree.
+		 */
+		if (!empty && !rcu_preempt_has_tasks(rnp))
+			rcu_cleanup_dead_rnp(rnp);
+
+		/*
		 * If this was the last task on the current list, and if
		 * we aren't waiting on any CPUs, report the quiescent state.
		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
--
cgit v0.10.2
From d19fb8d1f3f66cc342d30aa48f090c70afb753ed Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 12:56:16 -0700
Subject: rcu: Don't migrate blocked tasks even if all corresponding CPUs offline

When the last CPU associated with a given leaf rcu_node structure
goes offline, something must be done about the tasks queued on that
rcu_node structure.  Each of these tasks was preempted on one of the
leaf rcu_node structure's CPUs while in an RCU read-side critical
section that it has not yet exited.  Handling these tasks is the job of
rcu_preempt_offline_tasks(), which migrates them from the leaf rcu_node
structure to the root rcu_node structure.

Unfortunately, this migration has to be done one task at a time because
each task's allegiance must be shifted from the original leaf rcu_node
to the root, so that future attempts to deal with these tasks will
acquire the root rcu_node structure's ->lock rather than that of the
leaf.  Worse yet, this migration must be done with interrupts disabled,
which is not so good for realtime response, especially given that there
is no bound on the number of tasks on a given rcu_node structure's list.
(OK, OK, there is a bound, it is just that it is unreasonably large,
especially on 64-bit systems.)  This was not considered a problem back
when rcu_preempt_offline_tasks() was first written because realtime
systems were assumed not to do CPU-hotplug operations while real-time
applications were running.  This assumption has proved of dubious
validity given that people are starting to run multiple realtime
applications on a single SMP system and that it is common practice to
offline then online a CPU before starting its real-time application in
order to clear extraneous processing off of that CPU.  So we now need
CPU hotplug operations to avoid undue latencies.

This commit therefore avoids migrating these tasks, instead letting
them be dequeued one by one from the original leaf rcu_node structure
by rcu_read_unlock_special().  This means that the clearing of bits
from the upper-level rcu_node structures must be deferred until the
last such task has been dequeued, because otherwise subsequent grace
periods won't wait on them.  This commit has the beneficial side effect
of simplifying the CPU-hotplug code for TREE_PREEMPT_RCU, especially in
CONFIG_RCU_BOOST builds.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6625a1b..84f16cf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2276,7 +2276,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	int need_report = 0;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -2295,25 +2294,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
 	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
 	rnp->qsmaskinit &= ~rdp->grpmask;
-	if (rnp->qsmaskinit == 0) {
-		need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
 		rcu_cleanup_dead_rnp(rnp);
-	}
-
-	/*
-	 * We still hold the leaf rcu_node structure lock here, and
-	 * irqs are still disabled.  The reason for this subterfuge is
-	 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
-	 * held leads to deadlock.
-	 */
 	raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
-	rnp = rdp->mynode;
-	if (need_report & RCU_OFL_TASKS_NORM_GP)
-		rcu_report_unblock_qs_rnp(rnp, flags);
-	else
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	if (need_report & RCU_OFL_TASKS_EXP_GP)
-		rcu_report_exp_rnp(rsp, rnp, true);
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9315477..883ebc8 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -514,13 +514,6 @@ extern struct list_head rcu_struct_flavors;
 #define for_each_rcu_flavor(rsp) \
 	list_for_each_entry((rsp), &rcu_struct_flavors, flavors)

-/* Return values for rcu_preempt_offline_tasks(). */
-
-#define RCU_OFL_TASKS_NORM_GP	0x1	/* Tasks blocking normal */
-					/*  GP were moved to root. */
-#define RCU_OFL_TASKS_EXP_GP	0x2	/* Tasks blocking expedited */
-					/*  GP were moved to root. */
-
 /*
  * RCU implementation internal declarations:
  */
@@ -550,24 +543,13 @@ long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(void);
 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
-				      unsigned long flags);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-#ifdef CONFIG_HOTPLUG_CPU
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				     struct rcu_node *rnp,
-				     struct rcu_data *rdp);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
-			       bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
 static void __init __rcu_init_preempt(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8a2b841..d594da4 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -103,6 +103,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
 static struct rcu_state *rcu_state_p = &rcu_preempt_state;

 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+			       bool wake);

 /*
  * Tell them what RCU they are running.
@@ -545,92 +547,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)

 #ifdef CONFIG_HOTPLUG_CPU

-/*
- * Handle tasklist migration for case in which all CPUs covered by the
- * specified rcu_node have gone offline.  Move them up to the root
- * rcu_node.  The reason for not just moving them to the immediate
- * parent is to remove the need for rcu_read_unlock_special() to
- * make more than two attempts to acquire the target rcu_node's lock.
- * Returns true if there were tasks blocking the current RCU grace
- * period.
- *
- * Returns 1 if there was previously a task blocking the current grace
- * period on the specified rcu_node structure.
- *
- * The caller must hold rnp->lock with irqs disabled.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				     struct rcu_node *rnp,
-				     struct rcu_data *rdp)
-{
-	struct list_head *lp;
-	struct list_head *lp_root;
-	int retval = 0;
-	struct rcu_node *rnp_root = rcu_get_root(rsp);
-	struct task_struct *t;
-
-	if (rnp == rnp_root) {
-		WARN_ONCE(1, "Last CPU thought to be offlined?");
-		return 0;  /* Shouldn't happen: at least one CPU online. */
-	}
-
-	/* If we are on an internal node, complain bitterly. */
-	WARN_ON_ONCE(rnp != rdp->mynode);
-
-	/*
-	 * Move tasks up to root rcu_node.  Don't try to get fancy for
-	 * this corner-case operation -- just put this node's tasks
-	 * at the head of the root node's list, and update the root node's
-	 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
-	 * if non-NULL.  This might result in waiting for more tasks than
-	 * absolutely necessary, but this is a good performance/complexity
-	 * tradeoff.
-	 */
-	if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
-		retval |= RCU_OFL_TASKS_NORM_GP;
-	if (rcu_preempted_readers_exp(rnp))
-		retval |= RCU_OFL_TASKS_EXP_GP;
-	lp = &rnp->blkd_tasks;
-	lp_root = &rnp_root->blkd_tasks;
-	while (!list_empty(lp)) {
-		t = list_entry(lp->next, typeof(*t), rcu_node_entry);
-		raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
-		smp_mb__after_unlock_lock();
-		list_del(&t->rcu_node_entry);
-		t->rcu_blocked_node = rnp_root;
-		list_add(&t->rcu_node_entry, lp_root);
-		if (&t->rcu_node_entry == rnp->gp_tasks)
-			rnp_root->gp_tasks = rnp->gp_tasks;
-		if (&t->rcu_node_entry == rnp->exp_tasks)
-			rnp_root->exp_tasks = rnp->exp_tasks;
-#ifdef CONFIG_RCU_BOOST
-		if (&t->rcu_node_entry == rnp->boost_tasks)
-			rnp_root->boost_tasks = rnp->boost_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
-		raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-	}
-
-	rnp->gp_tasks = NULL;
-	rnp->exp_tasks = NULL;
-#ifdef CONFIG_RCU_BOOST
-	rnp->boost_tasks = NULL;
-	/*
-	 * In case root is being boosted and leaf was not.  Make sure
-	 * that we boost the tasks blocking the current grace period
-	 * in this case.
-	 */
-	raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
-	smp_mb__after_unlock_lock();
-	if (rnp_root->boost_tasks != NULL &&
-	    rnp_root->boost_tasks != rnp_root->gp_tasks &&
-	    rnp_root->boost_tasks != rnp_root->exp_tasks)
-		rnp_root->boost_tasks = rnp_root->gp_tasks;
-	raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-	return retval;
-}
-
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */

 /*
@@ -979,13 +895,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)

 #ifdef CONFIG_HOTPLUG_CPU

-/* Because preemptible RCU does not exist, no quieting of tasks. */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
-	__releases(rnp->lock)
-{
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
 /*
  * Because there is no preemptible RCU, there can be no readers blocked.
  */
@@ -1023,23 +932,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 	WARN_ON_ONCE(rnp->qsmask);
 }

-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections, and
- * such non-existent tasks cannot possibly have been blocking the current
- * grace period.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				     struct rcu_node *rnp,
-				     struct rcu_data *rdp)
-{
-	return 0;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Because preemptible RCU does not exist, it never has any callbacks
  * to check.
@@ -1058,20 +950,6 @@ void synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, there is never any need to
- * report on tasks preempted in RCU read-side critical sections during
- * expedited RCU grace periods.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
-			       bool wake)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Because preemptible RCU does not exist, rcu_barrier() is just
  * another name for rcu_barrier_sched().
--
cgit v0.10.2

From a8f4cbadfb5c5f7f5d9eef70821276a4e2e9289d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 13:48:41 -0700
Subject: rcu: Shorten irq-disable region in rcu_cleanup_dead_cpu()

Now that we are not migrating callbacks, there is no need to hold the
->orphan_lock across the ->qsmaskinit bit-clearing process.  This commit
therefore releases ->orphan_lock immediately after adopting the orphaned
RCU callbacks.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 84f16cf..990b406 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2289,14 +2289,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
 	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
 	rcu_adopt_orphan_cbs(rsp, flags);
+	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);

 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
-	raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
+	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
 	rnp->qsmaskinit &= ~rdp->grpmask;
 	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
 		rcu_cleanup_dead_rnp(rnp);
-	raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
--
cgit v0.10.2
McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d594da4..bcc6d91 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -540,7 +540,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -706,7 +706,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rnp->exp_tasks = rnp->blkd_tasks.next; @@ -985,7 +985,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; -- cgit v0.10.2 From 3e9f5c70d85060ff0db71826e2048daffec336e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:15:17 -0800 Subject: rcu: Don't spawn rcub kthreads on root rcu_node structure Now that offlining CPUs no longer moves leaf rcu_node structures' ->blkd_tasks lists to the root, there is no way for the root rcu_node structure's ->blkd_task list to be nonempty, unless the root node is also the sole leaf node. This commit therefore refrains from creating an rcub kthread for the root rcu_node structure unless it is also the sole leaf. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bcc6d91..3e64bb1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1352,12 +1352,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } static void rcu_prepare_kthreads(int cpu) -- cgit v0.10.2 From 1be0085b515e786e962b9f2d03616209eb6eb9a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:20:20 -0800 Subject: rcu: Don't initiate RCU priority boosting on root rcu_node Because there is no longer any preempted tasks on the root rcu_node, and because there is no longer ever an rcub kthread for the root rcu_node, this commit drops the code in force_qs_rnp() that attempts to awaken the non-existent root rcub kthread. This is strictly a performance enhancement, removing a root rcu_node ->lock acquisition and release along with some tests in rcu_initiate_boost(), ending with the test that notes that there is no rcub kthread. Signed-off-by: Paul E. 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 990b406..5a0a4c96 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2513,12 +2513,6 @@ static void force_qs_rnp(struct rcu_state *rsp, } raw_spin_unlock_irqrestore(&rnp->lock, flags); } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - } } /* -- cgit v0.10.2 From 5d0b024973027556b48a09bb36b55dc853ec7c6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Nov 2014 08:07:08 -0800 Subject: rcu: Don't bother affinitying rcub kthreads away from offline CPUs When rcu_boost_kthread_setaffinity() sees that all CPUs for a given rcu_node structure are now offline, it affinities the corresponding RCU-boost ("rcub") kthread away from those CPUs. This is pointless because the kthread cannot run on those offline CPUs in any case. This commit therefore removes this unneeded code. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e64bb1..1fac682 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1322,12 +1322,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -- cgit v0.10.2 From 3ba4d0e09bf965297e97adf195e0ea246cfe5c74 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Nov 2014 09:57:51 -0800 Subject: rcu: Note quiescent state when CPU goes offline The rcu_cleanup_dead_cpu() function (called after a CPU has gone completely offline) has not reported a quiescent state because there was probably at least one synchronize_rcu() between the time the CPU went offline and the CPU_DEAD notifier, and this would have detected the CPU's offline state via quiescent-state forcing. However, the plan is for CPUs to take themselves offline, at which point it makes sense for them to report their own quiescent state. This commit makes this change in preparation for the new CPU-hotplug setup. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5a0a4c96..24d3c67 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2297,7 +2297,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rnp->qsmaskinit &= ~rdp->grpmask; if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) rcu_cleanup_dead_rnp(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); -- cgit v0.10.2
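
In outline, the offline path above now finishes by reporting the departing CPU's quiescent state rather than merely dropping the lock. A simplified sketch of the resulting tail of rcu_cleanup_dead_cpu(), not the literal code:

	rnp->qsmaskinit &= ~rdp->grpmask;	/* CPU leaves the init mask. */
	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
		rcu_cleanup_dead_rnp(rnp);	/* Propagate the empty node upward. */
	rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags);
						/* Report the QS; releases rnp->lock. */
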
From abaf3f9d275b8d856ae5e47531e40c0bfeac012b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Nov 2014 16:30:01 +0800 Subject: rcu: Revert "Allow post-unlock reference for rt_mutex" to avoid priority-inversion The patch dfeb9765ce3c ("Allow post-unlock reference for rt_mutex") ensured that RCU priority boosting remained safe even when the rt_mutex had a post-unlock reference. But an rt_mutex that allows post-unlock references is definitely a bug, and that bug was fixed by commit 27e35715df54 ("rtmutex: Plug slow unlock race"). This fix made the previous patch (dfeb9765ce3c) useless. Even worse, the priority inversion introduced by the previous patch still exists: rcu_read_unlock_special() { rt_mutex_unlock(&rnp->boost_mtx); /* Priority inversion: * the current task has been deboosted and can immediately be * preempted as a low-priority task; it may then wait a long time * before being rescheduled, while the rcu-booster waits on this * low-priority task and sleeps. This priority inversion prevents * the rcu-booster from working as expected. */ complete(&rnp->boost_completion); } Just revert the patch to avoid it. Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 883ebc8..9535647 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,11 +172,6 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ - struct completion boost_completion; - /* Used to ensure that the rt_mutex used */ - /* to carry out the boosting is fully */ - /* released with no future boostee accesses */ - /* before that rt_mutex is re-initialized. */ struct rt_mutex boost_mtx; /* Used only for the priority-boosting */ /* side effect, not as a lock. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1fac682..625e260 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -429,10 +429,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1081,15 +1079,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } -- cgit v0.10.2 From 6cd534ef8bfe422a5a847b953d1039841509c374 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Dec 2014 07:45:13 -0800 Subject: rcu: Don't scan root rcu_node structure for stalled tasks Now that blocked tasks are no longer migrated to the root rcu_node structure, there is no need to scan the root rcu_node structure for blocked tasks stalling the current grace period. This commit therefore removes this scan. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 24d3c67..4ed2c28 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1107,15 +1107,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining.
- */ - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; -- cgit v0.10.2 From ab954c167ed9ac107a8beb1d6e1745ae698a3421 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 18:25:59 -0800 Subject: rcu: Remove redundant callback-list initialization The RCU callback lists are initialized in both rcu_boot_init_percpu_data() and rcu_init_percpu_data(). The former is intended for initializing immutable data, so this commit removes the initialization from rcu_boot_init_percpu_data() and leaves it in rcu_init_percpu_data(). This change prepares for permitting callbacks to be queued very early in boot. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4ed2c28..febd0f7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3419,9 +3419,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - init_callback_list(rdp); - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -- cgit v0.10.2 From a5c198f4f7da6cc48116ca239c59c9f44b753364 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 23 Nov 2014 20:30:06 -0800 Subject: rcu: Expand SRCU ->completed to 64 bits When rcutorture used only the low-order 32 bits of the grace-period number, it was not a problem for SRCU to use a 32-bit completed field. However, rcutorture now uses the full 64 bits on 64-bit systems, so this commit converts SRCU's ->completed field to unsigned long so as to provide 64 bits on 64-bit systems. Signed-off-by: Paul E. McKenney diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a2783cb..ef923dd 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -45,7 +45,7 @@ struct rcu_batch { #define RCU_BATCH_INIT(name) { NULL, &(name.head) } struct srcu_struct { - unsigned completed; + unsigned long completed; struct srcu_struct_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; @@ -135,7 +135,7 @@ int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); void synchronize_srcu_expedited(struct srcu_struct *sp); -long srcu_batches_completed(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3e..445bf8f 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } -- cgit v0.10.2 From 9735af5c78599703be633c057af3faee26482028 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 26 Nov 2014 10:42:50 -0800 Subject: rcu: Combine DEFINE_SRCU() and DEFINE_STATIC_SRCU() The DEFINE_SRCU() and DEFINE_STATIC_SRCU() definitions are quite similar, so this commit combines them, saving a bit of code and removing redundancy. Signed-off-by: Paul E. McKenney diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ef923dd..9cfd962 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,13 +102,11 @@ void process_srcu(struct work_struct *work); * define and init a srcu struct at build time. * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. */ -#define DEFINE_SRCU(name) \ +#define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - struct srcu_struct name = __SRCU_STRUCT_INIT(name); - -#define DEFINE_STATIC_SRCU(name) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name); + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) /** * call_srcu() - Queue a callback for invocation after an SRCU grace period -- cgit v0.10.2 From 83fe27ea531161a655f02dc7732d14cfaa27fd5d Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Fri, 5 Dec 2014 11:24:45 -0500 Subject: rcu: Make SRCU optional by using CONFIG_SRCU SRCU need not be compiled in all cases; for kernel-tinification efforts, it is desirable not to compile SRCU unless it is actually needed. This patch makes compiling SRCU optional by introducing a new Kconfig option, CONFIG_SRCU, which is selected when any of the components making use of SRCU are selected. If CONFIG_SRCU is not selected, srcu.o will not be compiled at all:

text data bss dec hex filename
2007 0 0 2007 7d7 kernel/rcu/srcu.o

The size of arch/powerpc/boot/zImage changes from

text data bss dec hex filename
831552 64180 23944 919676 e087c arch/powerpc/boot/zImage : before
829504 64180 23952 917636 e0084 arch/powerpc/boot/zImage : after

so the savings are about 2000 bytes. Signed-off-by: Pranith Kumar CC: Paul E. McKenney CC: Josh Triplett CC: Lai Jiangshan Signed-off-by: Paul E. McKenney [ paulmck: resolve conflict due to removal of arch/ia64/kvm/Kconfig. ] diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..3afee5f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_ARM_HOST + select SRCU depends on ARM_VIRT_EXT && ARM_LPAE ---help--- Support hosting virtualized guest machines. You will also diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ba85e9..b334084 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select KVM_ARM_HOST select KVM_ARM_VGIC select KVM_ARM_TIMER + select SRCU ---help--- Support hosting virtualized guest machines. diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 30e334e..2ae1282 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO + select SRCU ---help--- Support for hosting Guest kernels. Currently supported on MIPS32 processors.
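
As a reminder of what a component "making use of SRCU" looks like, here is a minimal, hypothetical sketch; the my_srcu domain and its caller are illustrative only and not taken from any of the subsystems in this patch:

	DEFINE_STATIC_SRCU(my_srcu);	/* via the combined __DEFINE_SRCU() above */

	int read_something(void)
	{
		int idx = srcu_read_lock(&my_srcu);
		/* ... sleepable read-side critical section ... */
		srcu_read_unlock(&my_srcu, idx);
		return 0;
	}
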
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index f5769f1..11850f3 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_EVENTFD + select SRCU config KVM_BOOK3S_HANDLER bool diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 646db9c..5fce52c 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING + select SRCU ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig index 2298cb1..1e968f7 100644 --- a/arch/tile/kvm/Kconfig +++ b/arch/tile/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM depends on HAVE_KVM && MODULES select PREEMPT_NOTIFIERS select ANON_INODES + select SRCU ---help--- Support hosting paravirtualized guest machines. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bd..6612699 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -138,6 +138,7 @@ config X86 select HAVE_ACPI_APEI_NMI if ACPI select ACPI_LEGACY_TABLES_LOOKUP if ACPI select X86_FEATURE_NAMES if PROC_FS + select SRCU config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f9d16ff..7dc7ba5 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -40,6 +40,7 @@ config KVM select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_VFIO + select SRCU ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index 3f44f29..91f8613 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -13,6 +13,7 @@ config COMMON_CLK bool select HAVE_CLK_PREPARE select CLKDEV_LOOKUP + select SRCU ---help--- The common clock framework is a single definition of struct clk, useful across many platforms, as well as an diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 29b2ef5..a171fef 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -2,6 +2,7 @@ menu "CPU Frequency scaling" config CPU_FREQ bool "CPU Frequency scaling" + select SRCU help CPU Frequency scaling allows you to change the clock speed of CPUs on the fly. This is a nice method to save power, because diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index faf4e70..3891f67 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -1,5 +1,6 @@ menuconfig PM_DEVFREQ bool "Generic Dynamic Voltage and Frequency Scaling (DVFS) support" + select SRCU help A device may have a list of frequencies and voltages available. devfreq, a generic DVFS framework can be registered for a device diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 5bdedf6..c355a22 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -5,6 +5,7 @@ menuconfig MD bool "Multiple devices driver support (RAID and LVM)" depends on BLOCK + select SRCU help Support multiple physical spindles through a single logical device. Required for RAID and logical volume management. 
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index d6607ee..84673eb 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -197,6 +197,7 @@ config NETCONSOLE_DYNAMIC config NETPOLL def_bool NETCONSOLE + select SRCU config NET_POLL_CONTROLLER def_bool NETPOLL diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a66768e..80e9c18 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -8,6 +8,7 @@ config BTRFS_FS select LZO_DECOMPRESS select RAID6_PQ select XOR_BLOCKS + select SRCU help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 22c629e..2a24249 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,5 +1,6 @@ config FSNOTIFY def_bool n + select SRCU source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index c51df1d..4a09975 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -5,6 +5,7 @@ config QUOTA bool "Quota support" select QUOTACTL + select SRCU help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the diff --git a/init/Kconfig b/init/Kconfig index 9afb971..f085969 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -501,9 +501,17 @@ config TINY_RCU endchoice +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + config TASKS_RCU bool "Task_based RCU implementation using voluntary context switch" default n + select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and @@ -1595,6 +1603,7 @@ config PERF_EVENTS depends on HAVE_PERF_EVENTS select ANON_INODES select IRQ_WORK + select SRCU help Enable kernel support for various performance events provided by software and hardware. diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6..ae9fc7c 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +#ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); +#endif /* CONFIG_SRCU */ + static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 48b28d3..7e01f78 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -251,6 +251,7 @@ config APM_EMULATION config PM_OPP bool + select SRCU ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. 
This diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index e6fae50..50a8084 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,5 @@ -obj-y += update.o srcu.o +obj-y += update.o +obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5f2ce61..7a9c93e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1215,6 +1215,7 @@ config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST + select SRCU default n help This option provides a kernel module that runs torture tests diff --git a/mm/Kconfig b/mm/Kconfig index 1d1ae6b..4395b12 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -325,6 +325,7 @@ config VIRT_TO_BUS config MMU_NOTIFIER bool + select SRCU config KSM bool "Enable KSM for page merging" diff --git a/security/tomoyo/Kconfig b/security/tomoyo/Kconfig index 8eb779b9d..604e718 100644 --- a/security/tomoyo/Kconfig +++ b/security/tomoyo/Kconfig @@ -5,6 +5,7 @@ config SECURITY_TOMOYO select SECURITYFS select SECURITY_PATH select SECURITY_NETWORK + select SRCU default n help This selects TOMOYO Linux, pathname-based access control. -- cgit v0.10.2 From 68158fe2b2aa0f0c1cf96b40020c7d180915479b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 07:41:31 -0800 Subject: rcu: Set default to RCU_CPU_STALL_INFO=y The RCU_CPU_STALL_INFO code has been in for quite some time, and has proven reliable. This commit therefore enables it by default. Signed-off-by: Paul E. McKenney diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5f2ce61..b47d9e4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1257,7 +1257,7 @@ config RCU_CPU_STALL_TIMEOUT config RCU_CPU_STALL_INFO bool "Print additional diagnostics on RCU CPU stall" depends on (TREE_RCU || PREEMPT_RCU) && DEBUG_KERNEL - default n + default y help For each stalled CPU that is aware of the current RCU grace period, print out additional per-CPU diagnostic information -- cgit v0.10.2 From e9408e4f278875aa5862560b4b425b27dfcb643f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 09:13:52 -0800 Subject: rcutorture: Add checks for stall ending before dump start The current rcutorture scripting checks for actual stalls (via the "Call Trace:" check), but fails to spot the case where a stall ends just as it is being detected. This commit therefore adds a check for this case. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index f962ba4..d8f35cf 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -36,7 +36,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T if test -s $T then print_warning Assertion failure in $file $title -- cgit v0.10.2 From fc908ed33e7c1428f799abb12399f906da03b397 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 8 Dec 2014 09:57:48 -0800 Subject: rcu: Make RCU_CPU_STALL_INFO include number of fqs attempts One way that an RCU CPU stall warning can happen is if the grace-period kthread is not allowed to execute. One proxy for this kthread's forward progress is the number of force-quiescent-state (fqs) scans. This commit therefore adds the number of fqs scans to the RCU CPU stall warning printouts when CONFIG_RCU_CPU_STALL_INFO=y. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fc..654b15b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1043,6 +1043,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) j1 = rcu_jiffies_till_stall_check(); ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; + rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b184..e300848 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -492,6 +492,8 @@ struct rcu_state { /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ + unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ + /* GP start. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb..769384d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1898,11 +1898,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } -- cgit v0.10.2 From 6ccd2ecd422644277b7d8b37222e3af3f43ea9ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Dec 2014 10:20:59 -0800 Subject: rcu: Improve diagnostics for spurious RCU CPU stall warnings The current RCU CPU stall warning code will print "Stall ended before state dump start" any time the stall-warning code is triggered on a CPU that has already reported a quiescent state for the current grace period, provided that all quiescent states have been reported for that grace period. However, a true stall can result in these symptoms, for example, by preventing RCU's grace-period kthreads from ever running. This commit therefore checks for this condition, reporting the end of the stall only if one of the grace-period counters has actually advanced. Otherwise, it reports the last time that the grace-period kthread made meaningful progress. (In normal situations, the grace-period kthread should make meaningful progress at least every jiffies_till_next_fqs jiffies.) Reported-by: Miroslav Benes Signed-off-by: Paul E. McKenney Tested-by: Miroslav Benes diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index ed186a9..55f9707 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -187,6 +187,11 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the behavior, you might need to replace some of the cond_resched() calls with calls to cond_resched_rcu_qs().
+o Anything that prevents RCU's grace-period kthreads from running. + This can result in the "All QSes seen" console-log message. + This message will include information on when the kthread last + ran and how often it should be expected to run. + o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 654b15b..a2ceb66 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1066,11 +1066,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } -static void print_other_cpu_stall(struct rcu_state *rsp) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; long delta; unsigned long flags; + unsigned long gpa; + unsigned long j; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1123,10 +1125,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else + if (ndetected) { rcu_dump_cpu_stacks(rsp); + } else { + if (ACCESS_ONCE(rsp->gpnum) != gpnum || + ACCESS_ONCE(rsp->completed) == gpnum) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + rsp->name, j - gpa, j, gpa, + jiffies_till_next_fqs); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } /* Complain about tasks blocking the grace period. */ @@ -1226,7 +1240,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); + print_other_cpu_stall(rsp, gpnum); } } @@ -1622,6 +1636,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); @@ -1682,6 +1697,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } mutex_unlock(&rsp->onoff_mutex); @@ -1698,6 +1714,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. 
*/ @@ -1736,6 +1753,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1772,6 +1790,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1821,6 +1840,7 @@ static int __noreturn rcu_gp_kthread(void *arg) if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1864,9 +1884,11 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e300848..5ec81cf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -488,6 +488,8 @@ struct rcu_state { /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_activity; /* Time of last GP kthread */ + /* activity in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ -- cgit v0.10.2 From e3663b1024d1f94688e5233440ad67a9bc10b94e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 20:26:55 -0800 Subject: rcu: Handle gpnum/completed wrap while dyntick idle Subtle race conditions can result if a CPU stays in dyntick-idle mode long enough for the ->gpnum and ->completed fields to wrap. For example, consider the following sequence of events:

o CPU 1 encounters a quiescent state while waiting for grace period 5 to complete, but then enters dyntick-idle mode.

o While CPU 1 is in dyntick-idle mode, the grace-period counters wrap around so that the grace period number is now 4.

o Just as CPU 1 exits dyntick-idle mode, grace period 4 completes and grace period 5 begins.

o The quiescent state that CPU 1 passed through during the old grace period 5 looks like it applies to the new grace period 5. Therefore, the new grace period 5 completes without CPU 1 having passed through a quiescent state.

This could clearly be a fatal surprise to any long-running RCU read-side critical section that happened to be running on CPU 1 at the time. At one time, this was not a problem, given that it takes significant time for the grace-period counters to overflow even on 32-bit systems. However, with the advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long idle periods are now becoming quite feasible. It is therefore time to close this race. This commit therefore avoids this race condition by having the quiescent-state forcing code detect when a CPU is falling too far behind, and setting a new rcu_data field ->gpwrap when this happens. Whenever this new ->gpwrap field is set, the CPU's ->gpnum and ->completed fields are known to be untrustworthy, and can be ignored, along with any associated quiescent states. Signed-off-by: Paul E. McKenney
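
To make the wrap detection concrete, here is a minimal sketch of the check added below, assuming ULONG_CMP_LT() as defined in rcupdate.h:

	/* Wraparound-safe "(a) < (b)" for free-running unsigned counters. */
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

	/* Sketch: flag a CPU whose grace-period snapshot is more than a
	 * quarter of the counter space behind its rcu_node's current value. */
	if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
			 rdp->mynode->gpnum))
		ACCESS_ONCE(rdp->gpwrap) = true; /* ->gpnum/->completed now suspect. */
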
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2ceb66..5987fdc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -930,6 +930,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { + if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + rdp->mynode->gpnum)) + ACCESS_ONCE(rdp->gpwrap) = true; return 0; } } @@ -1577,7 +1580,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { + if (rdp->completed == rnp->completed && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. */ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1592,7 +1596,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum) { + if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1603,6 +1607,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->passed_quiesce = 0; rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); + ACCESS_ONCE(rdp->gpwrap) = false; } return ret; } @@ -1616,7 +1621,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + rdp->completed == ACCESS_ONCE(rnp->completed) && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; @@ -2066,7 +2072,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -3190,7 +3196,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5ec81cf..7472ff3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -260,6 +260,7 @@ struct rcu_data { bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 769384d7..81ff8b9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1605,7 +1605,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke.
*/ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); -- cgit v0.10.2 From 79619cf5151257f82a4c016b02283e2ceb4b63ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Nov 2014 10:12:27 -0800 Subject: rcutorture: Issue warnings on close calls due to Reader Batch blows Normal rcutorture checking overestimates grace periods somewhat due to the fact that there is a delay from a grace-period request until the start of the corresponding grace period and another delay from the end of that grace period to notification of the requestor. This means that rcutorture's detection of RCU bugs is less sensitive than it might be. It turns out that rcutorture also checks the underlying grace-period "completed" counter (displayed in Reader Batch output), which in theory allows rcutorture to do exact checks. In practice, memory misordering (by both compiler and CPU) can result in false positives. However, experience on x86 shows that these false positives are quite rare, occurring less than one time per 1,000 hours of testing. This commit therefore does the exact checking, giving a warning if any Reader Batch blows happen, and flagging an error if they happen more often than once every three hours in long tests. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index d6cc07f..559e01a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -30,6 +30,7 @@ else echo Unreadable results directory: $i exit 1 fi +. tools/testing/selftests/rcutorture/bin/functions.sh configfile=`echo $i | sed -e 's/^.*\///'` ngps=`grep ver: $i/console.log 2> /dev/null | tail -1 | sed -e 's/^.* ver: //' -e 's/ .*$//'` @@ -48,4 +49,21 @@ else title="$title ($ngpsps per second)" fi echo $title + nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'` + if test -z "$nclosecalls" + then + exit 0 + fi + if test "$nclosecalls" -eq 0 + then + exit 0 + fi + # Compute number of close calls per tenth of an hour + nclosecalls10=`awk -v nclosecalls=$nclosecalls -v dur=$dur 'BEGIN { print int(nclosecalls * 36000 / dur) }' < /dev/null` + if test $nclosecalls10 -gt 5 -a $nclosecalls -gt 1 + then + print_bug $nclosecalls "Reader Batch close calls in" $(($dur/60)) minute run: $i + else + print_warning $nclosecalls "Reader Batch close calls in" $(($dur/60)) minute run: $i + fi fi -- cgit v0.10.2 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. McKenney
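
The point of the unsigned return type is that counter differences remain well defined across wrap, whereas signed overflow is undefined behavior in C. A minimal illustration:

	unsigned long started   = ULONG_MAX - 1;  /* counter just before wrapping... */
	unsigned long completed = 2;              /* ...and shortly after it wraps. */
	unsigned long ngps = completed - started; /* == 4, well defined for unsigned;
						     the signed version would be UB. */
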
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e53662..91f7e4c 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -94,7 +94,7 @@ static inline void rcu_virt_note_context_switch(int cpu) /* * Return the number of grace periods. */ -static inline long rcu_batches_completed(void) +static inline unsigned long rcu_batches_completed(void) { return 0; } @@ -102,7 +102,7 @@ static inline long rcu_batches_completed(void) /* * Return the number of bottom-half grace periods. */ -static inline long rcu_batches_completed_bh(void) +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5295379..9885bfb 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,9 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -long rcu_batches_completed(void); -long rcu_batches_completed_bh(void); -long rcu_batches_completed_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fc..e26d787 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -317,7 +317,7 @@ static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* * Return the number of RCU BH batches processed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b184..1a07d73 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); +unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb..f69300d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v0.10.2 From 3b009c0ebe02a06d2d5bd667da0eb9cefa79321d Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Fri, 21 Nov 2014 13:54:57 -0800 Subject: rcutorture: Make build-output parsing correctly flag RCU's warnings Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-build.sh b/tools/testing/selftests/rcutorture/bin/parse-build.sh index 499d1e5..a6b5762 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-build.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-build.sh @@ -26,12 +26,15 @@ # # Authors: Paul E. McKenney -T=$1 +F=$1 title=$2 +T=/tmp/parse-build.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T . functions.sh -if grep -q CC < $T +if grep -q CC < $F then : else @@ -39,18 +42,21 @@ else exit 1 fi -if grep -q "error:" < $T +if grep -q "error:" < $F then print_bug $title build errors: - grep "error:" < $T + grep "error:" < $F exit 2 fi -exit 0 -if egrep -q "rcu[^/]*\.c.*warning:|rcu.*\.h.*warning:" < $T +grep warning: < $F > $T/warnings +grep "include/linux/*rcu*\.h:" $T/warnings > $T/hwarnings +grep "kernel/rcu/[^/]*:" $T/warnings > $T/cwarnings +cat $T/hwarnings $T/cwarnings > $T/rcuwarnings +if test -s $T/rcuwarnings then print_warning $title build errors: - egrep "rcu[^/]*\.c.*warning:|rcu.*\.h.*warning:" < $T + cat $T/rcuwarnings exit 2 fi exit 0 -- cgit v0.10.2 From 6b80da42c02bc731ab38b6a37da5366abe26717f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 14:19:26 -0800 Subject: rcutorture: Use unsigned for Reader Batch computations The counter returned by the various ->completed functions is subject to overflow, which means that subtracting two such counters might result in overflow, which invokes undefined behavior in the C standard. This commit therefore changes these functions and variables to unsigned to avoid this undefined behavior. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4d559ba..f43e351 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,7 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - int (*completed)(void); + unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -296,7 +296,7 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static int rcu_torture_completed(void) +static unsigned long rcu_torture_completed(void) { return rcu_batches_completed(); } @@ -356,7 +356,7 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } -static int rcu_no_completed(void) +static unsigned long rcu_no_completed(void) { return 0; } @@ -407,7 +407,7 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static int rcu_bh_torture_completed(void) +static unsigned long rcu_bh_torture_completed(void) { return rcu_batches_completed_bh(); } @@ -510,7 +510,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) srcu_read_unlock(&srcu_ctl, idx); } -static int srcu_torture_completed(void) +static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(&srcu_ctl); } @@ -1015,8 +1015,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; - int completed; - int completed_end; + unsigned long completed; + unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1073,8 +1073,8 @@ static void rcu_torture_timer(unsigned long unused) static int 
rcu_torture_reader(void *arg) { - int completed; - int completed_end; + unsigned long completed; + unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; -- cgit v0.10.2 From c1fe9cde4ae904fffb5b4d975d0a37e99136ff50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 15:45:27 -0800 Subject: rcu: Provide rcu_batches_completed_sched() for TINY_RCU A bug in rcutorture has caused it to ignore completed batches. In preparation for fixing that bug, this commit provides TINY_RCU with the required rcu_batches_completed_sched(). Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 91f7e4c..1ce2d6b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -107,6 +107,14 @@ static inline unsigned long rcu_batches_completed_bh(void) return 0; } +/* + * Return the number of sched grace periods. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } -- cgit v0.10.2 From 1e32eaee4c370afca20324f634bf47ae991cd240 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 16:04:34 -0800 Subject: rcutorture: Drop rcu_torture_completed() and friends Now that the return type of rcu_batches_completed() and friends matches that of the rcu_torture_ops structure's ->completed field, the wrapper functions can be deleted. This commit carries out that deletion, while also wiring "sched"'s ->completed field to rcu_batches_completed_sched(). Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f43e351..aadbc07 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -296,11 +296,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static unsigned long rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -377,7 +372,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, + .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -407,11 +402,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static unsigned long rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - static void rcu_bh_torture_deferred_free(struct rcu_torture *p) { call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); @@ -423,7 +413,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, + .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -600,7 +590,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, + .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, -- cgit v0.10.2 From f9103c390257d06c162d9e3c2a90d2bdedadfe17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 16:48:20 -0800 Subject: rcu: Remove redundant rcu_batches_completed() declaration Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 1a07d73..69eb422 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -- cgit v0.10.2 From 917963d0b30f9c4153c372c165178501d97b6b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 17:10:16 -0800 Subject: rcutorture: Check from beginning to end of grace period Currently, rcutorture's Reader Batch checks measure from the end of the previous grace period to the end of the current one. This commit tightens up these checks by measuring from the start and end of the same grace period. This involves adding rcu_batches_started() and friends corresponding to the existing rcu_batches_completed() and friends. We leave SRCU alone for the moment, as it does not yet have a way of tracking both ends of its grace periods. Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1ce2d6b..9841921 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,7 +92,31 @@ static inline void rcu_virt_note_context_switch(int cpu) } /* - * Return the number of grace periods. + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. */ static inline unsigned long rcu_batches_completed(void) { @@ -100,7 +124,7 @@ static inline unsigned long rcu_batches_completed(void) } /* - * Return the number of bottom-half grace periods. + * Return the number of bottom-half grace periods completed. */ static inline unsigned long rcu_batches_completed_bh(void) { @@ -108,7 +132,7 @@ static inline unsigned long rcu_batches_completed_bh(void) } /* - * Return the number of sched grace periods. + * Return the number of sched grace periods completed. 
*/ static inline unsigned long rcu_batches_completed_sched(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9885bfb..c0dd124 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,6 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aadbc07..24142c2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,6 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); + unsigned long (*started)(void); unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); @@ -372,6 +373,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, + .started = rcu_batches_started, .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, @@ -413,6 +415,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, + .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, @@ -456,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -554,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -590,6 +595,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, + .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, @@ -628,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, @@ -1005,8 +1012,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; + unsigned long started; unsigned long completed; - unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1014,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1037,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1063,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { + unsigned long started; unsigned long completed; - unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -1083,7 +1095,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1104,14 +1119,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e26d787..c0faad5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -315,7 +315,43 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. 
+ */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); + +/* + * Return the number of RCU BH batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_sched(void) { @@ -324,7 +360,7 @@ unsigned long rcu_batches_completed_sched(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_bh(void) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f69300d..07e61a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -114,25 +114,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static unsigned long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked @@ -933,15 +914,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -- cgit v0.10.2 From 94162c8daa839bd0de791f66d754a880e2787c82 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Dec 2014 17:42:16 -0800 Subject: rcutorture: Handle different mpstat versions The mpstat command recently added the %gnice column, which messes up the cpus2use.sh script's idle-CPU calculations. This commit therefore uses $NF instead of $12 to select the last (%idle) column. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh index abe14b7..bb99cde 100755 --- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh +++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh @@ -24,7 +24,7 @@ ncpus=`grep '^processor' /proc/cpuinfo | wc -l` idlecpus=`mpstat | tail -1 | \ - awk -v ncpus=$ncpus '{ print ncpus * ($7 + $12) / 100 }'` + awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null ' BEGIN { cpus2use = idlecpus; -- cgit v0.10.2
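To make the new started/completed pairing above concrete, here is a minimal standalone sketch (userspace C, not kernel code; struct gp_counters and reader_gps() are invented names for illustration): snapshot ->started() when entering the read-side critical section, snapshot ->completed() when leaving it, and count the grace periods seen in between, adding one when a started-side snapshot is available, mirroring the adjustment in the rcutorture hunks above.

#include <stdio.h>

/*
 * Invented stand-in for an RCU flavor's grace-period counters:
 * gpnum advances when a grace period starts, completed when it ends,
 * so gpnum == completed + 1 while a grace period is in progress.
 */
struct gp_counters {
	unsigned long gpnum;		/* grace periods started */
	unsigned long completed;	/* grace periods completed */
};

/*
 * Grace periods observed by a reader, following the arithmetic patched
 * into rcu_torture_timer() and rcu_torture_reader() above.
 */
static unsigned long reader_gps(unsigned long started, unsigned long completed,
				int have_started_op)
{
	unsigned long gps = completed - started;

	if (have_started_op)	/* started-side snapshot may be mid-GP */
		gps++;
	return gps;
}

int main(void)
{
	struct gp_counters c = { .gpnum = 5, .completed = 4 };	/* GP #5 in flight */
	unsigned long started = c.gpnum;	/* what cur_ops->started() would return */

	c.gpnum = 7;		/* two more grace periods start... */
	c.completed = 7;	/* ...and everything through #7 completes */
	printf("grace periods seen: %lu\n",
	       reader_gps(started, c.completed, 1));	/* prints 3 */
	return 0;
}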
McKenney" Date: Wed, 10 Dec 2014 17:43:58 -0800 Subject: torture: Add "-enable-kvm -soundhw pcspk" to qemu command line More recent qemu implementations really want "-enable-kvm", and the "-soundhw pcspk" makes the script a bit less dependent on odd audio libraries being installed. This commit therefore adds both to the default qemu command line. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 8ca9f21..35b8532 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -8,9 +8,9 @@ # # Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args # -# qemu-args defaults to "-nographic", along with arguments specifying the -# number of CPUs and other options generated from -# the underlying CPU architecture. +# qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with +# arguments specifying the number of CPUs and other +# options generated from the underlying CPU architecture. # boot_args defaults to value returned by the per_version_boot_params # shell function. # @@ -138,7 +138,7 @@ then fi # Generate -smp qemu argument. -qemu_args="-nographic $qemu_args" +qemu_args="-enable-kvm -soundhw pcspk -nographic $qemu_args" cpu_count=`configNR_CPUS.sh $config_template` cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` vcpus=`identify_qemu_vcpus` -- cgit v0.10.2 From d2f74b5b488dd2b283977f64844fee3ba91676db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Dec 2014 10:50:22 -0800 Subject: torture: Flag console.log file to prevent holdovers from earlier runs A system misconfiguration that prevents qemu from running at all (for example, a missing dynamically linked library) will keep the console.log file from the previous run. This can fool the developer into thinking that this failed run actually completed correctly. This commit therefore overwrites the console.log file just before launching qemu. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 35b8532..5236e07 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -168,6 +168,7 @@ then touch $resdir/buildonly exit 0 fi +echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & qemu_pid=$! -- cgit v0.10.2 From 7602de4af192b1527767538454557cb7564bae60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 18:39:54 -0800 Subject: rcutorture: Add more diagnostics in rcu_barrier() test failure case Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 24142c2..30d42aa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1427,6 +1427,9 @@ static int rcu_torture_barrier(void *arg) cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; + pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", + atomic_read(&barrier_cbs_invoked), + n_barrier_cbs); WARN_ON_ONCE(1); } n_barrier_successes++; -- cgit v0.10.2 From 28423ad283d5348793b0c45cc9b1af058e776fd6 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Tue, 13 Jan 2015 13:16:18 -0800 Subject: ksoftirqd: Enable IRQs and call cond_resched() before poking RCU While debugging an issue with excessive softirq usage, I encountered the following note in commit 3e339b5dae24a706 ("softirq: Use hotplug thread infrastructure"): [ paulmck: Call rcu_note_context_switch() with interrupts enabled. ] ...but despite this note, the patch still calls RCU with IRQs disabled. This seemingly innocuous change caused a significant regression in softirq CPU usage on the sending side of a large TCP transfer (~1 GB/s): when introducing 0.01% packet loss, the softirq usage would jump to around 25%, spiking as high as 50%. Before the change, the usage would never exceed 5%. Moving the call to rcu_note_context_switch() after the cond_resched() call, as it was originally before the hotplug patch, completely eliminated this problem. Signed-off-by: Calvin Owens Cc: stable@vger.kernel.org Signed-off-by: Paul E. McKenney diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9..c497fcd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -656,9 +656,13 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(); local_irq_enable(); cond_resched(); + + preempt_disable(); + rcu_note_context_switch(); + preempt_enable(); + return; } local_irq_enable(); -- cgit v0.10.2 From 60479676eb6e9913176d93ebad194e3dc167bc63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Jan 2015 13:20:26 -0800 Subject: ksoftirqd: Use new cond_resched_rcu_qs() function Simplify run_ksoftirqd() by using the new cond_resched_rcu_qs() function that conditionally reschedules, but unconditionally supplies an RCU quiescent state. This commit is separate from the previous commit by Calvin Owens because Calvin's approach can be backported, while this commit cannot be. The reason that this commit cannot be backported is that cond_resched_rcu_qs() does not always provide the needed quiescent state in earlier kernels. Signed-off-by: Paul E. McKenney diff --git a/kernel/softirq.c b/kernel/softirq.c index c497fcd..8cdb988 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -657,12 +657,7 @@ static void run_ksoftirqd(unsigned int cpu) */ __do_softirq(); local_irq_enable(); - cond_resched(); - - preempt_disable(); - rcu_note_context_switch(); - preempt_enable(); - + cond_resched_rcu_qs(); return; } local_irq_enable(); -- cgit v0.10.2
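Taken together, the two ksoftirqd patches above converge on a simple rule for long-running kernel loops: on each pass, yield the CPU if needed and report an RCU quiescent state, and never report that state with interrupts disabled. The following kernel-style sketch shows the resulting pattern; it is not a buildable module, and do_pending_work() is a hypothetical stand-in for __do_softirq(), while local_irq_disable()/local_irq_enable() and cond_resched_rcu_qs() are the real kernel APIs used above.

static void long_running_loop(void)	/* hypothetical example, not from the patches */
{
	for (;;) {
		local_irq_disable();
		do_pending_work();	/* stand-in for __do_softirq() */
		local_irq_enable();

		/*
		 * Reschedule if needed and, either way, record an RCU
		 * quiescent state with interrupts enabled, which is the
		 * combination the two patches above arrive at.
		 */
		cond_resched_rcu_qs();
	}
}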
From a94844b22a2e2b9155bbc0878c507850477221c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Dec 2014 07:37:48 -0800 Subject: rcu: Optionally run grace-period kthreads at real-time priority Recent testing has shown that under heavy load, running RCU's grace-period kthreads at real-time priority can improve performance (according to 0day test robot) and reduce the incidence of RCU CPU stall warnings. However, most systems do just fine with the default non-realtime priorities for these kthreads, and it does not make sense to expose the entire user base to any risk stemming from this change, given that this change is of use only to a few users running extremely heavy workloads. Therefore, this commit allows users to specify realtime priorities for the grace-period kthreads, but leaves them running SCHED_OTHER by default. The realtime priority may be specified at build time via the RCU_KTHREAD_PRIO Kconfig parameter, or at boot time via the rcutree.kthread_prio parameter. Either way, 0 says to continue the default SCHED_OTHER behavior and values from 1 to 99 specify the SCHED_FIFO priority to be used. Note that a value of 0 is not permitted when the RCU_BOOST Kconfig parameter is specified. Signed-off-by: Paul E. McKenney diff --git a/init/Kconfig b/init/Kconfig index 9afb971..d3ee66a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -668,9 +668,10 @@ config RCU_BOOST config RCU_KTHREAD_PRIO int "Real-time priority to use for RCU worker threads" - range 1 99 - depends on RCU_BOOST - default 1 + range 1 99 if RCU_BOOST + range 0 99 if !RCU_BOOST + default 1 if RCU_BOOST + default 0 if !RCU_BOOST help This option specifies the SCHED_FIFO priority value that will be assigned to the rcuc/n and rcub/n threads and is also the value diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5987fdc..75ce123 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -3597,17 +3601,35 @@ static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; + int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct rcu_state *rsp; + struct sched_param sp; struct task_struct *t; + /* Force priority into range. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + if (kthread_prio != kthread_prio_in) + pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", + kthread_prio, kthread_prio_in); + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); + t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); } rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81ff8b9..dcb3263 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -34,10 +34,6 @@ #include "../locking/rtmutex_common.h" -/* rcuc/rcub kthread realtime priority */ -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -module_param(kthread_prio, int, 0644); - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. -- cgit v0.10.2 From 5cd37193ce8539be1e6ef76be226f4bcc984e0f5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 13 Dec 2014 20:32:04 -0800 Subject: rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used in places where it would be useful for it to apply to the normal RCU flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the case for workloads that aggressively overload the system, particularly those that generate large numbers of RCU updates on systems running NO_HZ_FULL CPUs. This commit therefore communicates quiescent states from cond_resched_rcu_qs() to the normal RCU flavors. Note that it is unfortunately necessary to leave the old ->passed_quiesce mechanism in place to allow quiescent states that apply to only one flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in that case, but that is not so good for debugging of RCU internals.) In addition, if one of the RCU flavor's grace period has stalled, this will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight quiescent state visible from other CPUs. Reported-by: Sasha Levin Reported-by: Dave Jones Signed-off-by: Paul E. McKenney [ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu() was used in preemptible code. ] diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index b63b9bb..08651da 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -56,14 +56,14 @@ rcuboost: The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 This file has one line per CPU, or eight for this 8-CPU system. 
The fields are as follows: @@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this Kernels compiled with CONFIG_RCU_BOOST=y display the following from /debug/rcu/rcu_preempt/rcudata: - 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... 
kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 This is similar to the output discussed above, but contains the following additional fields: diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f593..4682287 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -331,12 +331,13 @@ static inline void rcu_init_nohz(void) extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ + rcu_all_qs(); \ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e53662..fabd3fa 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -154,7 +154,10 @@ static inline bool rcu_is_watching(void) return true; } - #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_all_qs(void) +{ +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5295379..ddba927 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_all_qs(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75ce123..cb00e03 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -288,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiescent state for all RCU flavors. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desperate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. 
*/ @@ -1609,6 +1628,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); ACCESS_ONCE(rdp->gpwrap) = false; @@ -2075,8 +2095,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2085,6 +2107,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2129,7 +2152,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -3174,9 +3198,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3510,6 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7472ff3..1e7f8b0 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,8 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. 
*/ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e..fbb6240 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, -- cgit v0.10.2 From fb81a44b88e6173ed0f6e9d6a1afa5305fb63f6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 08:35:02 -0800 Subject: rcu: Add GP-kthread-starvation checks to CPU stall warnings This commit adds a message that is printed if the relevant grace-period kthread has not been able to run for the two seconds preceding the stall warning. (The two seconds is double the maximum interval between successive bouts of quiescent-state forcing.) Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 55f9707..53e7d28 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -152,6 +152,15 @@ no non-lazy callbacks ("." is printed otherwise, as shown above) and "D" indicates that dyntick-idle processing is enabled ("." is printed otherwise, for example, if disabled via the "nohz=" kernel boot parameter). +If the relevant grace-period kthread has been unable to run prior to +the stall warning, the following additional line is printed: + + rcu_preempt kthread starved for 2023 jiffies! + +Starving the grace-period kthreads of CPU time can of course result in +RCU CPU stall warnings even when all CPUs and tasks have passed through +the required quiescent states. + Multiple Warnings From One Stall diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb00e03..e335f33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1073,6 +1073,21 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Complain about starvation of grace-period kthread. + */ +static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +{ + unsigned long gpa; + unsigned long j; + + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + if (j - gpa > 2 * HZ) + pr_err("%s kthread starved for %ld jiffies!\n", + rsp->name, j - gpa); +} + +/* * Dump stacks of all tasks running on stalled CPUs. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) @@ -1169,9 +1184,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); + rcu_check_gp_kthread_starvation(rsp); + force_quiescent_state(rsp); /* Kick them all. 
*/ } @@ -1196,6 +1212,9 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); + + rcu_check_gp_kthread_starvation(rsp); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v0.10.2 From ec1fe396ff42e240c9b32111ee53665c5916fe5e Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Mon, 22 Dec 2014 11:10:12 -0800 Subject: rcu: Fix RCU CPU stall detection in tiny implementation The tiny RCU CPU stall detection depends on *rcp->curtail not being NULL. It is, however, a tail pointer and thus NULL by definition. Instead, we should check rcp->rcucblist for the presence of pending callbacks that need to be processed. With this fix, INFO about the stall is printed and jiffies_stall (jiffies at next stall) is correctly updated. Note that the check for pending callbacks is necessary to avoid spurious warnings if there are no pending callbacks. Signed-off-by: Miroslav Benes [ paulmck: Fused identical "if" statements, ported to -rcu. ] Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c565..80f908a 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->ticks_this_gp++; j = jiffies; js = ACCESS_ONCE(rcp->jiffies_stall); - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) + } else if (ULONG_CMP_GE(j, js)) { ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + } } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -- cgit v0.10.2 From 630181c4a915edb0761bb282c567d3abc8ca2f35 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Dec 2014 21:33:14 -0800 Subject: rcu: Initialize tiny RCU stall-warning timeouts at boot The current tiny RCU stall-warning code assumes that the jiffies counter starts at zero; however, it is sometimes initialized to other values, for example, -30,000. This commit therefore changes rcu_init() to invoke reset_cpu_stall_ticks() for both flavors of RCU to initialize the stall-warning times properly at boot. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649..fc0f2c0 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -383,6 +383,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); rcu_early_boot_tests(); } -- cgit v0.10.2
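Both tiny-RCU stall-warning fixes above hinge on jiffies arithmetic: the kernel deliberately boots with jiffies close to the wrap point (the -30,000 mentioned in the last commit log), so stall deadlines must be seeded from the live counter and compared with wrap-safe macros. Here is a standalone sketch (userspace C; the ULONG_CMP_GE() definition matches the kernel's) of why a naive comparison misfires around the wrap:

#include <limits.h>
#include <stdio.h>

/* Same definition as the kernel's wrap-safe comparison macro. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long j = (unsigned long)-30000;	/* jiffies shortly after boot */
	unsigned long js = j + 40000;			/* stall deadline, past the wrap */

	/* Naive comparison claims the deadline has already passed... */
	printf("naive j >= js:       %d\n", j >= js);			/* prints 1: wrong */
	/* ...while the wrap-safe macro correctly says it has not. */
	printf("ULONG_CMP_GE(j, js): %d\n", ULONG_CMP_GE(j, js));	/* prints 0: right */
	return 0;
}

This is also why rcu_init() now invokes reset_cpu_stall_ticks() at boot: seeding jiffies_stall from the live jiffies value keeps the very first ULONG_CMP_GE() check in check_cpu_stall() meaningful.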