From ca9558a33f658155c3b69f92897c2e6a848684f5 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Fri, 31 Oct 2014 14:55:05 +0000
Subject: rcu: Remove redundant rcu_is_cpu_rrupt_from_idle() from tiny RCU

Let's start by assuming that something in the idle loop posts a callback,
and that a scheduling-clock interrupt occurs:

1.  The system is idle and stays that way, no runnable tasks.

2.  Scheduling-clock interrupt occurs, rcu_check_callbacks() is called
    as a result, which in turn calls rcu_is_cpu_rrupt_from_idle().

3.  rcu_is_cpu_rrupt_from_idle() reports the CPU was interrupted from
    idle, which results in a rcu_sched_qs() call, which does a
    raise_softirq(RCU_SOFTIRQ).

4.  Upon return from interrupt, rcu_irq_exit() is invoked, which calls
    rcu_idle_enter_common(), which in turn calls rcu_sched_qs() again,
    which does another raise_softirq(RCU_SOFTIRQ).

5.  The softirq runs shortly thereafter and invokes
    rcu_process_callbacks(), which invokes __rcu_process_callbacks().

6.  So now callbacks can be invoked.  At least they can be if ->donetail
    has been updated.  Which it will have been, because rcu_sched_qs()
    invokes rcu_qsctr_help().

In the described scenario, rcu_sched_qs() and raise_softirq(RCU_SOFTIRQ)
get called twice in steps 3 and 4.  This redundancy can be eliminated by
removing the rcu_is_cpu_rrupt_from_idle() function.

Signed-off-by: Alexander Gordeev
Cc: Paul E. McKenney
Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649..805b6d5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -186,16 +186,6 @@ EXPORT_SYMBOL(__rcu_is_watching);
 #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */

 /*
- * Test whether the current CPU was interrupted from idle.  Nested
- * interrupts don't count, we must be running at the first interrupt
- * level.
- */
-static int rcu_is_cpu_rrupt_from_idle(void)
-{
-	return rcu_dynticks_nesting <= 1;
-}
-
-/*
  * Helper function for rcu_sched_qs() and rcu_bh_qs().
  * Also irqs are disabled to avoid confusion due to interrupt handlers
  * invoking call_rcu().
@@ -250,7 +240,7 @@ void rcu_bh_qs(void)
 void rcu_check_callbacks(int user)
 {
 	RCU_TRACE(check_cpu_stalls());
-	if (user || rcu_is_cpu_rrupt_from_idle())
+	if (user)
 		rcu_sched_qs();
 	else if (!in_softirq())
 		rcu_bh_qs();
--
cgit v0.10.2

From 924df8a0117aa2725750f1ec4d282867534d9a89 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 29 Oct 2014 15:37:58 -0700
Subject: rcu: Fix invoke_rcu_callbacks() comment

Despite what the comment says, it is only softirqs that are disabled,
not interrupts.  This commit therefore fixes the comment.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4c106fc..9c25b99 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2601,7 +2601,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  * Schedule RCU callback invocation.  If the specified type of RCU
  * does not support RCU priority boosting, just do a direct call,
  * otherwise wake up the per-CPU kernel kthread.  Note that because we
- * are running on the current CPU with interrupts disabled, the
+ * are running on the current CPU with softirqs disabled, the
  * rcu_cpu_kthread_task cannot disappear out from under us.
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
--
cgit v0.10.2

From 536fa402221f09633e7c5801b327055ab716a363 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 5 Sep 2014 11:14:48 -0700
Subject: compiler: Allow 1- and 2-byte smp_load_acquire() and smp_store_release()

CPUs without single-byte and double-byte loads and stores place some
"interesting" requirements on concurrent code.  For example (adapted
from Peter Hurley's test code), suppose we have the following structure:

	struct foo {
		spinlock_t lock1;
		spinlock_t lock2;
		char a; /* Protected by lock1. */
		char b; /* Protected by lock2. */
	};
	struct foo *foop;

Of course, it is common (and good) practice to place data protected
by different locks in separate cache lines.  However, if the locks are
rarely acquired (for example, only in rare error cases), and there are
a great many instances of the data structure, then memory footprint can
trump false-sharing concerns, so that it can be better to place them in
the same cache line as above.

But if the CPU does not support single-byte loads and stores, a store
to foop->a will do a non-atomic read-modify-write operation on foop->b,
which will come as a nasty surprise to someone holding foop->lock2.
So we now require CPUs to support single-byte and double-byte loads and
stores.  Therefore, this commit adjusts the definition of __native_word()
to allow these sizes to be used by smp_load_acquire() and
smp_store_release().

Signed-off-by: Paul E. McKenney
Cc: Peter Zijlstra

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index a1c81f8..49811cd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -385,7 +385,7 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int

 /* Is this type a native word size -- useful for atomic operations */
 #ifndef __native_word
-# define __native_word(t) (sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
+# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
 #endif

 /* Compile time object size, -1 for unknown */
--
cgit v0.10.2
From ac59853c06993a442e8060bc19040b2ca3025aec Mon Sep 17 00:00:00 2001
From: Pranith Kumar
Date: Thu, 13 Nov 2014 14:24:14 -0500
Subject: rcupdate: Replace smp_read_barrier_depends() with lockless_dereference()

Recently lockless_dereference() was added, which can be used in place
of hard-coding smp_read_barrier_depends().  This patch makes that change.

Signed-off-by: Pranith Kumar
Signed-off-by: Paul E. McKenney

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ed4f593..386ba28 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -582,11 +582,11 @@ static inline void rcu_preempt_sleep_check(void)
 })
 #define __rcu_dereference_check(p, c, space) \
 ({ \
-	typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \
+	/* Dependency order vs. p above. */ \
+	typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \
 	rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \
 	rcu_dereference_sparse(p, space); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
-	((typeof(*p) __force __kernel *)(_________p1)); \
+	((typeof(*p) __force __kernel *)(________p1)); \
 })
 #define __rcu_dereference_protected(p, c, space) \
 ({ \
@@ -603,10 +603,10 @@ static inline void rcu_preempt_sleep_check(void)
 })
 #define __rcu_dereference_index_check(p, c) \
 ({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	/* Dependency order vs. p above. */ \
+	typeof(p) _________p1 = lockless_dereference(p); \
 	rcu_lockdep_assert(c, \
 			   "suspicious rcu_dereference_index_check() usage"); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
 	(_________p1); \
 })
--
cgit v0.10.2
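Aside: a minimal sketch of lockless_dereference() in use (hypothetical
names, not from the patch); it combines the load of a pointer with the
dependency-ordering barrier that callers previously had to supply by hand
via smp_read_barrier_depends():

	#include <linux/compiler.h>

	struct config {
		int threshold;
	};

	struct config *cur_config;	/* Published by some writer. */

	int read_threshold(void)
	{
		struct config *c = lockless_dereference(cur_config);

		/* The dependent access is ordered after the pointer load. */
		return c ? c->threshold : 0;
	}
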
From 5f6130fa52ee0df0b1da4518c5bbef42bcfe7d83 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 9 Dec 2014 17:53:34 +0800
Subject: tiny_rcu: Directly force QS when call_rcu_[bh|sched]() on idle_task

For RCU in UP, context-switch = QS = GP, thus we can force a
context-switch when call_rcu_[bh|sched]() is invoked from the idle task.

After doing so, rcu_idle/irq_enter/exit() are useless, so we can simply
make these functions empty.  More importantly, this change does not
change the functionality logically.

Note: rcu_sched_qs()/raise_softirq(RCU_SOFTIRQ) in rcu_idle_enter() and
the outermost rcu_irq_exit() will have to wake up the ksoftirqd
(due to in_interrupt() == 0).

Before this patch:

	call_rcu_sched() in idle;
	do other stuffs;
	outmost rcu_irq_exit() (or rcu_idle_enter())
	  rcu_sched_qs()
	    QS, and GP, and advance cb
	    wake up the ksoftirqd
	      set resched
	resched to ksoftirqd (or other)

After this patch:

	call_rcu_sched() in idle
	  set resched
	do other stuffs
	outmost rcu_irq_exit() (or rcu_idle_enter(), now both empty functions)
	start to resched (see above)
	rcu_sched_qs()
	  QS, and GP, and advance cb
	  wake up the ksoftirqd
	resched to ksoftirqd (or other)

These two code paths are almost the same.

Size changes after patching:

	size kernel/rcu/tiny-old.o kernel/rcu/tiny-patched.o
	   text    data     bss     dec     hex	filename
	   3449     206       8    3663     e4f	kernel/rcu/tiny-old.o
	   2406     144       8    2558     9fe	kernel/rcu/tiny-patched.o

Signed-off-by: Lai Jiangshan
Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02e..80adef7 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);

 void rcu_early_boot_tests(void);

+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
 #endif /* __LINUX_RCU_H */

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 805b6d5..85287ec 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
 		       void (*func)(struct rcu_head *rcu),
 		       struct rcu_ctrlblk *rcp);

-static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-
 #include "tiny_plugin.h"

-/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
-static void rcu_idle_enter_common(long long newval)
-{
-	if (newval) {
-		RCU_TRACE(trace_rcu_dyntick(TPS("--="),
-					    rcu_dynticks_nesting, newval));
-		rcu_dynticks_nesting = newval;
-		return;
-	}
-	RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
-				    rcu_dynticks_nesting, newval));
-	if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
-		struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
-		RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
-					    rcu_dynticks_nesting, newval));
-		ftrace_dump(DUMP_ALL);
-		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
-			  current->pid, current->comm,
-			  idle->pid, idle->comm); /* must be idle task! */
-	}
-	rcu_sched_qs(); /* implies rcu_bh_inc() */
-	barrier();
-	rcu_dynticks_nesting = newval;
-}
-
 /*
  * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode (i.e., if the new value of dynticks_nesting is zero).
+ * entered that mode.
  */
 void rcu_idle_enter(void)
 {
-	unsigned long flags;
-	long long newval;
-
-	local_irq_save(flags);
-	WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
-	if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
-	    DYNTICK_TASK_NEST_VALUE)
-		newval = 0;
-	else
-		newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
-	rcu_idle_enter_common(newval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);

@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
  */
 void rcu_irq_exit(void)
 {
-	unsigned long flags;
-	long long newval;
-
-	local_irq_save(flags);
-	newval = rcu_dynticks_nesting - 1;
-	WARN_ON_ONCE(newval < 0);
-	rcu_idle_enter_common(newval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_irq_exit);

-/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
-static void rcu_idle_exit_common(long long oldval)
-{
-	if (oldval) {
-		RCU_TRACE(trace_rcu_dyntick(TPS("++="),
-					    oldval, rcu_dynticks_nesting));
-		return;
-	}
-	RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
-	if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
-		struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
-		RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
-					    oldval, rcu_dynticks_nesting));
-		ftrace_dump(DUMP_ALL);
-		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
-			  current->pid, current->comm,
-			  idle->pid, idle->comm); /* must be idle task! */
-	}
-}
-
 /*
  * Exit idle, so that we are no longer in an extended quiescent state.
  */
 void rcu_idle_exit(void)
 {
-	unsigned long flags;
-	long long oldval;
-
-	local_irq_save(flags);
-	oldval = rcu_dynticks_nesting;
-	WARN_ON_ONCE(rcu_dynticks_nesting < 0);
-	if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
-		rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
-	else
-		rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-	rcu_idle_exit_common(oldval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);

@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
  */
 void rcu_irq_enter(void)
 {
-	unsigned long flags;
-	long long oldval;
-
-	local_irq_save(flags);
-	oldval = rcu_dynticks_nesting;
-	rcu_dynticks_nesting++;
-	WARN_ON_ONCE(rcu_dynticks_nesting == 0);
-	rcu_idle_exit_common(oldval);
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_irq_enter);

@@ -179,7 +89,7 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
  */
 bool notrace __rcu_is_watching(void)
 {
-	return rcu_dynticks_nesting;
+	return true;
 }
 EXPORT_SYMBOL(__rcu_is_watching);

@@ -347,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
 	rcp->curtail = &head->next;
 	RCU_TRACE(rcp->qlen++);
 	local_irq_restore(flags);
+
+	if (unlikely(is_idle_task(current))) {
+		/* force scheduling for rcu_sched_qs() */
+		resched_cpu(0);
+	}
 }

 /*

diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c565..b5fc5e5 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -147,7 +147,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 	js = ACCESS_ONCE(rcp->jiffies_stall);
 	if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
 		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
-		       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+		       rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
		       jiffies - rcp->gp_start, rcp->qlen);
		dump_stack();
	}

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9c25b99..203b50d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -935,12 +935,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 }

 /*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
-/*
  * Return true if the specified CPU has passed through a quiescent
  * state by virtue of being in or having passed through a dynticks
  * idle state since the last call to dyntick_save_progress_counter()
--
cgit v0.10.2

From f520c98e3e5212d8c282a86d9b7697dd70326192 Mon Sep 17 00:00:00 2001
From: Ying Xue
Date: Fri, 12 Dec 2014 09:36:14 +0800
Subject: rculist: Fix sparse warning

This fixes the following sparse warnings:

	make C=1 CF=-D__CHECK_ENDIAN__ net/ipv6/addrconf.o
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)
	net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces)

To silence these sparse complaints, an RCU annotation should be added to
the "next" pointer of the hlist_node structure through the hlist_next_rcu()
macro when iterating over an hlist with
hlist_for_each_entry_continue_rcu_bh().

By the way, this commit also resolves the same error appearing in
hlist_for_each_entry_continue_rcu().

Signed-off-by: Ying Xue
Signed-off-by: Paul E. McKenney

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 529bc94..a18b16f 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -524,11 +524,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  * @member:	the name of the hlist_node within the struct.
  */
 #define hlist_for_each_entry_continue_rcu(pos, member)			\
-	for (pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
-			typeof(*(pos)), member);			\
+	for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+			&(pos)->member)), typeof(*(pos)), member);	\
	     pos;							\
-	     pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
-			typeof(*(pos)), member))
+	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+			&(pos)->member)), typeof(*(pos)), member))

 /**
  * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
@@ -536,11 +536,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  * @member:	the name of the hlist_node within the struct.
  */
 #define hlist_for_each_entry_continue_rcu_bh(pos, member)		\
-	for (pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\
-			typeof(*(pos)), member);			\
+	for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
+			&(pos)->member)), typeof(*(pos)), member);	\
	     pos;							\
-	     pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\
-			typeof(*(pos)), member))
+	     pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
+			&(pos)->member)), typeof(*(pos)), member))

 /**
  * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
--
cgit v0.10.2
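Aside: a minimal sketch of the iterator fixed above (hypothetical types,
not from the patch); the walk continues from an entry already in hand,
and any data of interest is copied out before dropping the read-side
critical section:

	#include <linux/rculist.h>

	struct item {
		int key;
		struct hlist_node node;
	};

	int find_next_positive_key(struct item *pos)
	{
		int key = -1;

		rcu_read_lock();
		hlist_for_each_entry_continue_rcu(pos, node) {
			if (pos->key > 0) {
				key = pos->key;	/* Copy out while still protected. */
				break;
			}
		}
		rcu_read_unlock();
		return key;
	}
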
From 87af9e7ff9d909e70a006ca0974466e2a1d8db0a Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 12 Dec 2014 10:11:44 +0100
Subject: hotplugcpu: Avoid deadlocks by waking active_writer

Commit b2c4623dcd07 ("rcu: More on deadlock between CPU hotplug and
expedited grace periods") introduced another problem that can easily
be reproduced by starting/stopping cpus in a loop.

E.g.:

	for i in `seq 5000`; do
		echo 1 > /sys/devices/system/cpu/cpu1/online
		echo 0 > /sys/devices/system/cpu/cpu1/online
	done

Will result in:

	INFO: task /cpu_start_stop:1 blocked for more than 120 seconds.
	Call Trace:
	([<00000000006a028e>] __schedule+0x406/0x91c)
	 [<0000000000130f60>] cpu_hotplug_begin+0xd0/0xd4
	 [<0000000000130ff6>] _cpu_up+0x3e/0x1c4
	 [<0000000000131232>] cpu_up+0xb6/0xd4
	 [<00000000004a5720>] device_online+0x80/0xc0
	 [<00000000004a57f0>] online_store+0x90/0xb0
	...

And a deadlock.

The problem is that if the last ref in put_online_cpus() can't get the
cpu_hotplug.lock, the puts_pending count is incremented, but a sleeping
active_writer might never be woken up, therefore never exiting the loop
in cpu_hotplug_begin().

This fix removes puts_pending and turns refcount into an atomic variable.
We also introduce a wait queue for the active_writer, to avoid possible
races and use-after-free.  There is no need to take the lock in
put_online_cpus() anymore.

Can't reproduce it with this fix.

Signed-off-by: David Hildenbrand
Signed-off-by: Paul E. McKenney

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d22023..1972b16 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;

 static struct {
 	struct task_struct *active_writer;
-	struct mutex lock; /* Synchronizes accesses to refcount, */
+	/* wait queue to wake up the active_writer */
+	wait_queue_head_t wq;
+	/* verifies that no writer will get active while readers are active */
+	struct mutex lock;
 	/*
 	 * Also blocks the new readers during
 	 * an ongoing cpu hotplug operation.
 	 */
-	int refcount;
-	/* And allows lockless put_online_cpus(). */
-	atomic_t puts_pending;
+	atomic_t refcount;

 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif
 } cpu_hotplug = {
 	.active_writer = NULL,
+	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-	.refcount = 0,
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	.dep_map = {.name = "cpu_hotplug.lock" },
 #endif
@@ -86,15 +87,6 @@ static struct {
 #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
 #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)

-static void apply_puts_pending(int max)
-{
-	int delta;
-
-	if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
-		delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
-		cpu_hotplug.refcount -= delta;
-	}
-}

 void get_online_cpus(void)
 {
@@ -103,8 +95,7 @@ void get_online_cpus(void)
 		return;
 	cpuhp_lock_acquire_read();
 	mutex_lock(&cpu_hotplug.lock);
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
 	if (!mutex_trylock(&cpu_hotplug.lock))
 		return false;
 	cpuhp_lock_acquire_tryread();
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 	return true;
 }
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);

 void put_online_cpus(void)
 {
+	int refcount;
+
 	if (cpu_hotplug.active_writer == current)
 		return;
-	if (!mutex_trylock(&cpu_hotplug.lock)) {
-		atomic_inc(&cpu_hotplug.puts_pending);
-		cpuhp_lock_release();
-		return;
-	}

-	if (WARN_ON(!cpu_hotplug.refcount))
-		cpu_hotplug.refcount++; /* try to fix things up */
+	refcount = atomic_dec_return(&cpu_hotplug.refcount);
+	if (WARN_ON(refcount < 0)) /* try to fix things up */
+		atomic_inc(&cpu_hotplug.refcount);
+
+	if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+		wake_up(&cpu_hotplug.wq);

-	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
-		wake_up_process(cpu_hotplug.active_writer);
-	mutex_unlock(&cpu_hotplug.lock);
 	cpuhp_lock_release();

 }
@@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
  */
 void cpu_hotplug_begin(void)
 {
-	cpu_hotplug.active_writer = current;
+	DEFINE_WAIT(wait);

+	cpu_hotplug.active_writer = current;
 	cpuhp_lock_acquire();
+
 	for (;;) {
 		mutex_lock(&cpu_hotplug.lock);
-		apply_puts_pending(1);
-		if (likely(!cpu_hotplug.refcount))
-			break;
-		__set_current_state(TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (likely(!atomic_read(&cpu_hotplug.refcount)))
+			break;
 		mutex_unlock(&cpu_hotplug.lock);
 		schedule();
 	}
+	finish_wait(&cpu_hotplug.wq, &wait);
 }

 void cpu_hotplug_done(void)
--
cgit v0.10.2
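Aside: the deadlock fix above is an instance of the standard wait-queue
idiom.  A generic sketch (hypothetical names, not from the patch) — the
waiter re-checks its condition after prepare_to_wait(), so a concurrent
wake_up() can never be lost:

	#include <linux/wait.h>
	#include <linux/atomic.h>
	#include <linux/sched.h>

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
	static atomic_t my_count = ATOMIC_INIT(0);

	void wait_for_zero(void)
	{
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&my_count))	/* Re-check after queueing. */
				break;
			schedule();
		}
		finish_wait(&my_wq, &wait);
	}

	void put_ref(void)
	{
		if (atomic_dec_return(&my_count) <= 0 && waitqueue_active(&my_wq))
			wake_up(&my_wq);
	}
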
From 41050a009640f9f330a5b916563ca7faf853a98c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 18 Dec 2014 12:31:27 -0800
Subject: rcu: Fix rcu_barrier() race that could result in too-short wait

The rcu_barrier() no-callbacks check for no-CBs CPUs has race conditions.
It checks a given CPU's lists of callbacks, and if all three no-CBs lists
are empty, ignores that CPU.  However, these three lists could potentially
be empty even when callbacks are present if the check executed just as
the callbacks were being moved from one list to another.

It turns out that recent versions of rcutorture can spot this race.

This commit plugs this hole by consolidating the per-list counts of
no-CBs callbacks into a single count, which is incremented before the
corresponding callback is posted and decremented after it is invoked.
Then rcu_barrier() checks this single count to reliably determine whether
the corresponding CPU has no-CBs callbacks.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 203b50d..5b9f3b9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3344,6 +3344,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 		} else {
 			_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
 					   rsp->n_barrier_done);
+			smp_mb__before_atomic();
 			atomic_inc(&rsp->barrier_cpu_count);
 			__call_rcu(&rdp->barrier_head,
 				   rcu_barrier_callback, rsp, cpu, 0);

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b184..cb59086 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -340,14 +340,10 @@ struct rcu_data {
 #ifdef CONFIG_RCU_NOCB_CPU
 	struct rcu_head *nocb_head;	/* CBs waiting for kthread. */
 	struct rcu_head **nocb_tail;
-	atomic_long_t nocb_q_count;	/* # CBs waiting for kthread */
-	atomic_long_t nocb_q_count_lazy; /*  (approximate). */
+	atomic_long_t nocb_q_count;	/* # CBs waiting for nocb */
+	atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
 	struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
 	struct rcu_head **nocb_follower_tail;
-	atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
-	atomic_long_t nocb_follower_count_lazy; /*  (approximate). */
-	int nocb_p_count;		/* # CBs being invoked by kthread */
-	int nocb_p_count_lazy;		/*  (approximate). */
 	wait_queue_head_t nocb_wq;	/* For nocb kthreads to sleep on. */
 	struct task_struct *nocb_kthread;
 	int nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */
@@ -356,8 +352,6 @@ struct rcu_data {
 	struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
 					/* CBs waiting for GP. */
 	struct rcu_head **nocb_gp_tail;
-	long nocb_gp_count;
-	long nocb_gp_count_lazy;
 	bool nocb_leader_sleep;		/* Is the nocb leader thread asleep? */
 	struct rcu_data *nocb_next_follower;
 					/* Next follower in wakeup chain. */
@@ -622,24 +616,15 @@ static void rcu_dynticks_task_exit(void);
 #endif /* #ifndef RCU_TREE_NONCORE */

 #ifdef CONFIG_RCU_TRACE
-#ifdef CONFIG_RCU_NOCB_CPU
-/* Sum up queue lengths for tracing. */
+/* Read out queue lengths for tracing. */
 static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 {
-	*ql = atomic_long_read(&rdp->nocb_q_count) +
-	      rdp->nocb_p_count +
-	      atomic_long_read(&rdp->nocb_follower_count) +
-	      rdp->nocb_p_count + rdp->nocb_gp_count;
-	*qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
-	       rdp->nocb_p_count_lazy +
-	       atomic_long_read(&rdp->nocb_follower_count_lazy) +
-	       rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
-}
+#ifdef CONFIG_RCU_NOCB_CPU
+	*ql = atomic_long_read(&rdp->nocb_q_count);
+	*qll = atomic_long_read(&rdp->nocb_q_count_lazy);
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
 	*ql = 0;
 	*qll = 0;
-}
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+}
 #endif /* #ifdef CONFIG_RCU_TRACE */

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb..e5c43b7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2056,9 +2056,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 {
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	unsigned long ret;
+#ifdef CONFIG_PROVE_RCU
 	struct rcu_head *rhp;
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+	/*
+	 * Check count of all no-CBs callbacks awaiting invocation.
+	 * There needs to be a barrier before this function is called,
+	 * but associated with a prior determination that no more
+	 * callbacks would be posted.  In the worst case, the first
+	 * barrier in _rcu_barrier() suffices (but the caller cannot
+	 * necessarily rely on this, not a substitute for the caller
+	 * getting the concurrency design right!).  There must also be
+	 * a barrier between the following load and posting of a callback
+	 * (if a callback is in fact needed).  This is associated with an
+	 * atomic_inc() in the caller.
+	 */
+	ret = atomic_long_read(&rdp->nocb_q_count);

-	/* No-CBs CPUs might have callbacks on any of three lists. */
+#ifdef CONFIG_PROVE_RCU
 	rhp = ACCESS_ONCE(rdp->nocb_head);
 	if (!rhp)
 		rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +2089,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 			       cpu, rhp->func);
 		WARN_ON_ONCE(1);
 	}
+#endif /* #ifdef CONFIG_PROVE_RCU */

-	return !!rhp;
+	return !!ret;
 }

 /*
@@ -2095,9 +2113,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 	struct task_struct *t;

 	/* Enqueue the callback on the nocb list and update counts. */
+	atomic_long_add(rhcount, &rdp->nocb_q_count);
+	/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
 	old_rhpp = xchg(&rdp->nocb_tail, rhtp);
 	ACCESS_ONCE(*old_rhpp) = rhp;
-	atomic_long_add(rhcount, &rdp->nocb_q_count);
 	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
 	smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */

@@ -2288,9 +2307,6 @@ wait_again:
 		/* Move callbacks to wait-for-GP list, which is empty. */
 		ACCESS_ONCE(rdp->nocb_head) = NULL;
 		rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
-		rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
-		rdp->nocb_gp_count_lazy =
-			atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
 		gotcbs = true;
 	}

@@ -2338,9 +2354,6 @@ wait_again:
 		/* Append callbacks to follower's "done" list. */
 		tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
 		*tail = rdp->nocb_gp_head;
-		atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
-		atomic_long_add(rdp->nocb_gp_count_lazy,
-				&rdp->nocb_follower_count_lazy);
 		smp_mb__after_atomic(); /* Store *tail before wakeup. */
 		if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
 			/*
@@ -2415,13 +2428,11 @@ static int rcu_nocb_kthread(void *arg)
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
 		ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
 		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
-		c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
-		cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
-		rdp->nocb_p_count += c;
-		rdp->nocb_p_count_lazy += cl;

 		/* Each pass through the following loop invokes a callback. */
-		trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
+		trace_rcu_batch_start(rdp->rsp->name,
+				      atomic_long_read(&rdp->nocb_q_count_lazy),
+				      atomic_long_read(&rdp->nocb_q_count), -1);
 		c = cl = 0;
 		while (list) {
 			next = list->next;
@@ -2443,9 +2454,9 @@ static int rcu_nocb_kthread(void *arg)
 			list = next;
 		}
 		trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
-		ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
-		ACCESS_ONCE(rdp->nocb_p_count_lazy) =
-						rdp->nocb_p_count_lazy - cl;
+		smp_mb__before_atomic();  /* _add after CB invocation. */
+		atomic_long_add(-c, &rdp->nocb_q_count);
+		atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
 		rdp->n_nocbs_invoked += c;
 	}
 	return 0;
--
cgit v0.10.2

From 5a43b88e98eaca88fa12abcacd0000adc879dd2c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Wed, 24 Dec 2014 17:55:28 +0800
Subject: rcu: Remove "select IRQ_WORK" from config TREE_RCU

Commit 48a7639ce80c ("rcu: Make callers awaken grace-period kthread")
removed the irq_work_queue() call, so TREE_RCU no longer needs irq work.
This commit therefore updates RCU's Kconfig and #includes accordingly.

Signed-off-by: Lai Jiangshan
Signed-off-by: Paul E. McKenney

diff --git a/init/Kconfig b/init/Kconfig
index 9afb971..39b4313 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -470,7 +470,6 @@ choice
 config TREE_RCU
 	bool "Tree-based hierarchical RCU"
 	depends on !PREEMPT && SMP
-	select IRQ_WORK
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP system with hundreds or
@@ -480,7 +479,6 @@ config TREE_RCU
 config PREEMPT_RCU
 	bool "Preemptible tree-based hierarchical RCU"
 	depends on PREEMPT
-	select IRQ_WORK
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index cb59086..41860b9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
-#include <linux/irq_work.h>

 /*
  * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
--
cgit v0.10.2
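Aside on commit 41050a00 above: the rcu_barrier() fix relies on a simple
counting discipline.  An abstract sketch (hypothetical helpers publish()
and process(), not from the patch) — increment before publishing an item,
decrement only after it has been fully processed, so a zero count reliably
means "nothing queued and nothing still in flight":

	#include <linux/atomic.h>

	struct item;
	extern void publish(struct item *ip);	/* Hypothetical. */
	extern void process(struct item *ip);	/* Hypothetical. */

	static atomic_long_t q_count = ATOMIC_LONG_INIT(0);

	void enqueue_item(struct item *ip)
	{
		atomic_long_inc(&q_count);	/* Count first... */
		publish(ip);			/* ...then make it visible. */
	}

	void invoke_item(struct item *ip)
	{
		process(ip);
		smp_mb__before_atomic();	/* Order processing before decrement. */
		atomic_long_dec(&q_count);
	}

	bool anything_pending(void)
	{
		return !!atomic_long_read(&q_count);
	}
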
McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb..d59913e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1127,7 +1127,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v0.10.2 From 74e871ac6cb17f67cbefb569d98a8d05de666e07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Oct 2014 21:08:53 -0700 Subject: rcu: Rename "empty" to "empty_norm" in preparation for boost rework This commit undertakes a simple variable renaming to make way for some rework of RCU priority boosting. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d59913e..3d22f0b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -313,8 +313,8 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, */ void rcu_read_unlock_special(struct task_struct *t) { - int empty; int empty_exp; + int empty_norm; int empty_exp_now; unsigned long flags; struct list_head *np; @@ -367,7 +367,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -393,7 +393,7 @@ void rcu_read_unlock_special(struct task_struct *t) * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, -- cgit v0.10.2 From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 11:22:37 -0700 Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() in preparation for the rework of RCU priority boosting. This new function will be invoked from rcu_read_unlock_special() in the reworked scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf rcu_node structure's ->qsmaskinit field has already been updated. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fc..75c6b33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2227,6 +2227,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) } /* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. 
From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 11:22:37 -0700
Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu()

This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu()
in preparation for the rework of RCU priority boosting.  This new
function will be invoked from rcu_read_unlock_special() in the reworked
scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf
rcu_node structure's ->qsmaskinit field has already been updated.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4c106fc..75c6b33 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2227,6 +2227,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 }

 /*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section.  Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely.  That said, invoking it after the fact will cost you
+ * a needless lock acquisition.  So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+	long mask;
+	struct rcu_node *rnp = rnp_leaf;
+
+	if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+		return;
+	for (;;) {
+		mask = rnp->grpmask;
+		rnp = rnp->parent;
+		if (!rnp)
+			break;
+		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+		smp_mb__after_unlock_lock(); /* GP memory ordering. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit) {
+			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			return;
+		}
+		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+	}
+}
+
+/*
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
  * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2236,7 +2276,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	unsigned long mask;
 	int need_report = 0;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -2252,24 +2291,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
 	rcu_adopt_orphan_cbs(rsp, flags);

-	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
-	mask = rdp->grpmask;	/* rnp->grplo is constant. */
-	do {
-		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
-		smp_mb__after_unlock_lock();
-		rnp->qsmaskinit &= ~mask;
-		if (rnp->qsmaskinit != 0) {
-			if (rnp != rdp->mynode)
-				raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-			break;
-		}
-		if (rnp == rdp->mynode)
-			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-		else
-			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-		mask = rnp->grpmask;
-		rnp = rnp->parent;
-	} while (rnp != NULL);
+	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+	raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
+	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
+	rnp->qsmaskinit &= ~rdp->grpmask;
+	if (rnp->qsmaskinit == 0) {
+		need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+		rcu_cleanup_dead_rnp(rnp);
+	}

 	/*
	 * We still hold the leaf rcu_node structure lock here, and

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b184..9315477 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -552,6 +552,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
 				      unsigned long flags);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3d22f0b..d044b9c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -307,6 +307,15 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
 }

 /*
+ * Return true if the specified rcu_node structure has tasks that were
+ * preempted within an RCU read-side critical section.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+	return !list_empty(&rnp->blkd_tasks);
+}
+
+/*
  * Handle special cases during rcu_read_unlock(), such as needing to
  * notify RCU core processing or task having blocked during the RCU
  * read-side critical section.
@@ -967,6 +976,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }

+/*
+ * Because there is no preemptible RCU, there can be no readers blocked.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+	return false;
+}
+
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */

 /*
--
cgit v0.10.2
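Aside on rcu_cleanup_dead_rnp() above: it is an instance of a generic
upward-propagation pattern over a tree of bitmasks.  A standalone sketch
(hypothetical node type, not from the patch) — clear this node's bit in
its parent and keep climbing only while a parent becomes completely empty:

	struct node {
		unsigned long mask;		/* This node's bit in parent->child_bits. */
		unsigned long child_bits;	/* One bit per active child. */
		struct node *parent;		/* NULL at the root. */
	};

	void propagate_empty(struct node *n)
	{
		struct node *p;

		for (p = n->parent; p; n = p, p = p->parent) {
			p->child_bits &= ~n->mask;
			if (p->child_bits)
				return;	/* A sibling subtree is still active. */
		}
	}
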
From b6a932d1d9840727eee619d455bdeeedaa205be9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 12:05:04 -0700
Subject: rcu: Make rcu_read_unlock_special() propagate ->qsmaskinit bit clearing

This commit causes rcu_read_unlock_special() to propagate ->qsmaskinit
bit clearing up the rcu_node tree once a given rcu_node structure's
blkd_tasks list becomes empty.  This is the final commit in preparation
for the rework of RCU priority boosting: It enables preempted tasks to
remain queued on their rcu_node structure even after all of that
rcu_node structure's CPUs have gone offline.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 75c6b33..6625a1b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2329,6 +2329,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
 }

+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
+
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 }

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d044b9c..8a2b841 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -322,9 +322,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  */
 void rcu_read_unlock_special(struct task_struct *t)
 {
-	int empty_exp;
-	int empty_norm;
-	int empty_exp_now;
+	bool empty;
+	bool empty_exp;
+	bool empty_norm;
+	bool empty_exp_now;
 	unsigned long flags;
 	struct list_head *np;
 #ifdef CONFIG_RCU_BOOST
@@ -376,6 +377,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 				break;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		}
+		empty = !rcu_preempt_has_tasks(rnp);
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		empty_exp = !rcu_preempted_readers_exp(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
@@ -396,6 +398,14 @@ void rcu_read_unlock_special(struct task_struct *t)
 #endif /* #ifdef CONFIG_RCU_BOOST */

 		/*
+		 * If this was the last task on the list, go see if we
+		 * need to propagate ->qsmaskinit bit clearing up the
+		 * rcu_node tree.
+		 */
+		if (!empty && !rcu_preempt_has_tasks(rnp))
+			rcu_cleanup_dead_rnp(rnp);
+
+		/*
		 * If this was the last task on the current list, and if
		 * we aren't waiting on any CPUs, report the quiescent state.
		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
--
cgit v0.10.2
From d19fb8d1f3f66cc342d30aa48f090c70afb753ed Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 12:56:16 -0700
Subject: rcu: Don't migrate blocked tasks even if all corresponding CPUs offline

When the last CPU associated with a given leaf rcu_node structure
goes offline, something must be done about the tasks queued on that
rcu_node structure.  Each of these tasks was preempted on one of the
leaf rcu_node structure's CPUs while in an RCU read-side critical
section that it has not yet exited.  Handling these tasks is the job of
rcu_preempt_offline_tasks(), which migrates them from the leaf rcu_node
structure to the root rcu_node structure.

Unfortunately, this migration has to be done one task at a time because
each task's allegiance must be shifted from the original leaf rcu_node
to the root, so that future attempts to deal with these tasks will
acquire the root rcu_node structure's ->lock rather than that of the
leaf.  Worse yet, this migration must be done with interrupts disabled,
which is not so good for realtime response, especially given that there
is no bound on the number of tasks on a given rcu_node structure's list.
(OK, OK, there is a bound, it is just that it is unreasonably large,
especially on 64-bit systems.)  This was not considered a problem back
when rcu_preempt_offline_tasks() was first written because realtime
systems were assumed not to do CPU-hotplug operations while real-time
applications were running.  This assumption has proved of dubious
validity given that people are starting to run multiple realtime
applications on a single SMP system and that it is common practice to
offline then online a CPU before starting its real-time application in
order to clear extraneous processing off of that CPU.  So we now need
CPU hotplug operations to avoid undue latencies.

This commit therefore avoids migrating these tasks, instead letting
them be dequeued one by one from the original leaf rcu_node structure
by rcu_read_unlock_special().  This means that the clearing of bits
from the upper-level rcu_node structures must be deferred until the
last such task has been dequeued, because otherwise subsequent grace
periods won't wait on them.  This commit has the beneficial side effect
of simplifying the CPU-hotplug code for TREE_PREEMPT_RCU, especially in
CONFIG_RCU_BOOST builds.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6625a1b..84f16cf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2276,7 +2276,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	int need_report = 0;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -2295,25 +2294,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
 	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
 	rnp->qsmaskinit &= ~rdp->grpmask;
-	if (rnp->qsmaskinit == 0) {
-		need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
 		rcu_cleanup_dead_rnp(rnp);
-	}
-
-	/*
-	 * We still hold the leaf rcu_node structure lock here, and
-	 * irqs are still disabled.  The reason for this subterfuge is
-	 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
-	 * held leads to deadlock.
-	 */
 	raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
-	rnp = rdp->mynode;
-	if (need_report & RCU_OFL_TASKS_NORM_GP)
-		rcu_report_unblock_qs_rnp(rnp, flags);
-	else
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	if (need_report & RCU_OFL_TASKS_EXP_GP)
-		rcu_report_exp_rnp(rsp, rnp, true);
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9315477..883ebc8 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -514,13 +514,6 @@ extern struct list_head rcu_struct_flavors;
 #define for_each_rcu_flavor(rsp) \
 	list_for_each_entry((rsp), &rcu_struct_flavors, flavors)

-/* Return values for rcu_preempt_offline_tasks(). */
-
-#define RCU_OFL_TASKS_NORM_GP	0x1	/* Tasks blocking normal */
-					/*  GP were moved to root. */
-#define RCU_OFL_TASKS_EXP_GP	0x2	/* Tasks blocking expedited */
-					/*  GP were moved to root. */
-
 /*
  * RCU implementation internal declarations:
  */
@@ -550,24 +543,13 @@ long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(void);
 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
-				      unsigned long flags);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-#ifdef CONFIG_HOTPLUG_CPU
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				     struct rcu_node *rnp,
-				     struct rcu_data *rdp);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
-			       bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
 static void __init __rcu_init_preempt(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8a2b841..d594da4 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -103,6 +103,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
 static struct rcu_state *rcu_state_p = &rcu_preempt_state;

 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+			       bool wake);

 /*
  * Tell them what RCU they are running.
@@ -545,92 +547,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)

 #ifdef CONFIG_HOTPLUG_CPU

-/*
- * Handle tasklist migration for case in which all CPUs covered by the
- * specified rcu_node have gone offline.  Move them up to the root
- * rcu_node.  The reason for not just moving them to the immediate
- * parent is to remove the need for rcu_read_unlock_special() to
- * make more than two attempts to acquire the target rcu_node's lock.
- * Returns true if there were tasks blocking the current RCU grace
- * period.
- *
- * Returns 1 if there was previously a task blocking the current grace
- * period on the specified rcu_node structure.
- *
- * The caller must hold rnp->lock with irqs disabled.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				     struct rcu_node *rnp,
-				     struct rcu_data *rdp)
-{
-	struct list_head *lp;
-	struct list_head *lp_root;
-	int retval = 0;
-	struct rcu_node *rnp_root = rcu_get_root(rsp);
-	struct task_struct *t;
-
-	if (rnp == rnp_root) {
-		WARN_ONCE(1, "Last CPU thought to be offlined?");
-		return 0;  /* Shouldn't happen: at least one CPU online. */
-	}
-
-	/* If we are on an internal node, complain bitterly. */
-	WARN_ON_ONCE(rnp != rdp->mynode);
-
-	/*
-	 * Move tasks up to root rcu_node.  Don't try to get fancy for
-	 * this corner-case operation -- just put this node's tasks
-	 * at the head of the root node's list, and update the root node's
-	 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
-	 * if non-NULL.  This might result in waiting for more tasks than
-	 * absolutely necessary, but this is a good performance/complexity
-	 * tradeoff.
-	 */
-	if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
-		retval |= RCU_OFL_TASKS_NORM_GP;
-	if (rcu_preempted_readers_exp(rnp))
-		retval |= RCU_OFL_TASKS_EXP_GP;
-	lp = &rnp->blkd_tasks;
-	lp_root = &rnp_root->blkd_tasks;
-	while (!list_empty(lp)) {
-		t = list_entry(lp->next, typeof(*t), rcu_node_entry);
-		raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
-		smp_mb__after_unlock_lock();
-		list_del(&t->rcu_node_entry);
-		t->rcu_blocked_node = rnp_root;
-		list_add(&t->rcu_node_entry, lp_root);
-		if (&t->rcu_node_entry == rnp->gp_tasks)
-			rnp_root->gp_tasks = rnp->gp_tasks;
-		if (&t->rcu_node_entry == rnp->exp_tasks)
-			rnp_root->exp_tasks = rnp->exp_tasks;
-#ifdef CONFIG_RCU_BOOST
-		if (&t->rcu_node_entry == rnp->boost_tasks)
-			rnp_root->boost_tasks = rnp->boost_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
-		raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-	}
-
-	rnp->gp_tasks = NULL;
-	rnp->exp_tasks = NULL;
-#ifdef CONFIG_RCU_BOOST
-	rnp->boost_tasks = NULL;
-	/*
-	 * In case root is being boosted and leaf was not.  Make sure
-	 * that we boost the tasks blocking the current grace period
-	 * in this case.
-	 */
-	raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
-	smp_mb__after_unlock_lock();
-	if (rnp_root->boost_tasks != NULL &&
-	    rnp_root->boost_tasks != rnp_root->gp_tasks &&
-	    rnp_root->boost_tasks != rnp_root->exp_tasks)
-		rnp_root->boost_tasks = rnp_root->gp_tasks;
-	raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-	return retval;
-}
-
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */

 /*
@@ -979,13 +895,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)

 #ifdef CONFIG_HOTPLUG_CPU

-/* Because preemptible RCU does not exist, no quieting of tasks. */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
-	__releases(rnp->lock)
-{
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
 /*
  * Because there is no preemptible RCU, there can be no readers blocked.
  */
@@ -1023,23 +932,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 	WARN_ON_ONCE(rnp->qsmask);
 }

-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections, and
- * such non-existent tasks cannot possibly have been blocking the current
- * grace period.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				     struct rcu_node *rnp,
-				     struct rcu_data *rdp)
-{
-	return 0;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Because preemptible RCU does not exist, it never has any callbacks
  * to check.
@@ -1058,20 +950,6 @@ void synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, there is never any need to
- * report on tasks preempted in RCU read-side critical sections during
- * expedited RCU grace periods.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
-			       bool wake)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Because preemptible RCU does not exist, rcu_barrier() is just
  * another name for rcu_barrier_sched().
--
cgit v0.10.2

From a8f4cbadfb5c5f7f5d9eef70821276a4e2e9289d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 31 Oct 2014 13:48:41 -0700
Subject: rcu: Shorten irq-disable region in rcu_cleanup_dead_cpu()

Now that we are not migrating callbacks, there is no need to hold the
->orphan_lock across the ->qsmaskinit bit-clearing process.  This commit
therefore releases ->orphan_lock immediately after adopting the orphaned
RCU callbacks.

Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 84f16cf..990b406 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2289,14 +2289,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
 	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
 	rcu_adopt_orphan_cbs(rsp, flags);
+	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);

 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
-	raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
+	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
 	rnp->qsmaskinit &= ~rdp->grpmask;
 	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
 		rcu_cleanup_dead_rnp(rnp);
-	raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
--
cgit v0.10.2
McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d594da4..bcc6d91 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -540,7 +540,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -706,7 +706,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rnp->exp_tasks = rnp->blkd_tasks.next; @@ -985,7 +985,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; -- cgit v0.10.2 From 3e9f5c70d85060ff0db71826e2048daffec336e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:15:17 -0800 Subject: rcu: Don't spawn rcub kthreads on root rcu_node structure Now that offlining CPUs no longer moves leaf rcu_node structures' ->blkd_tasks lists to the root, there is no way for the root rcu_node structure's ->blkd_task list to be nonempty, unless the root node is also the sole leaf node. This commit therefore refrains from creating an rcub kthread for the root rcu_node structure unless it is also the sole leaf. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bcc6d91..3e64bb1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1352,12 +1352,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } static void rcu_prepare_kthreads(int cpu) -- cgit v0.10.2 From 1be0085b515e786e962b9f2d03616209eb6eb9a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:20:20 -0800 Subject: rcu: Don't initiate RCU priority boosting on root rcu_node Because there is no longer any preempted tasks on the root rcu_node, and because there is no longer ever an rcub kthread for the root rcu_node, this commit drops the code in force_qs_rnp() that attempts to awaken the non-existent root rcub kthread. This is strictly a performance enhancement, removing a root rcu_node ->lock acquisition and release along with some tests in rcu_initiate_boost(), ending with the test that notes that there is no rcub kthread. Signed-off-by: Paul E. 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 990b406..5a0a4c96 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2513,12 +2513,6 @@ static void force_qs_rnp(struct rcu_state *rsp, } raw_spin_unlock_irqrestore(&rnp->lock, flags); } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - } } /* -- cgit v0.10.2 From 5d0b024973027556b48a09bb36b55dc853ec7c6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Nov 2014 08:07:08 -0800 Subject: rcu: Don't bother affinitying rcub kthreads away from offline CPUs When rcu_boost_kthread_setaffinity() sees that all CPUs for a given rcu_node structure are now offline, it affinities the corresponding RCU-boost ("rcub") kthread away from those CPUs. This is pointless because the kthread cannot run on those offline CPUs in any case. This commit therefore removes this unneeded code. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e64bb1..1fac682 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1322,12 +1322,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -- cgit v0.10.2 From 3ba4d0e09bf965297e97adf195e0ea246cfe5c74 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Nov 2014 09:57:51 -0800 Subject: rcu: Note quiescent state when CPU goes offline The rcu_cleanup_dead_cpu() function (called after a CPU has gone completely offline) has not reported a quiescent state because there was probably at least one synchronize_rcu() between the time the CPU went offline and the CPU_DEAD notifier, and this would have detected the CPU's offline state via quiescent-state forcing. However, the plan is for CPUs to take themselves offline, at which point it makes sense for them to report their own quiescent state. This commit makes this change in preparation for the new CPU-hotplug setup. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5a0a4c96..24d3c67 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2297,7 +2297,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rnp->qsmaskinit &= ~rdp->grpmask; if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) rcu_cleanup_dead_rnp(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); -- cgit v0.10.2
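
In outline, the offline path above now finishes by reporting the departing CPU's quiescent state rather than merely dropping the lock. A simplified sketch of the resulting tail of rcu_cleanup_dead_cpu(), not the literal code:

	rnp->qsmaskinit &= ~rdp->grpmask;	/* CPU leaves the init mask. */
	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
		rcu_cleanup_dead_rnp(rnp);	/* Propagate the empty node upward. */
	rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags);
						/* Report the QS; releases rnp->lock. */
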
From abaf3f9d275b8d856ae5e47531e40c0bfeac012b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Nov 2014 16:30:01 +0800 Subject: rcu: Revert "Allow post-unlock reference for rt_mutex" to avoid priority-inversion The patch dfeb9765ce3c ("Allow post-unlock reference for rt_mutex") ensured that RCU priority boosting remained safe even when the rt_mutex had a post-unlock reference. But an rt_mutex that allows post-unlock references is definitely a bug, and that bug was fixed by commit 27e35715df54 ("rtmutex: Plug slow unlock race"). This fix made the previous patch (dfeb9765ce3c) useless. Even worse, the priority inversion introduced by the previous patch still exists: rcu_read_unlock_special() { rt_mutex_unlock(&rnp->boost_mtx); /* Priority inversion: * the current task has been deboosted and can immediately be * preempted as a low-priority task; it may then wait a long time * before being rescheduled, while the rcu-booster waits on this * low-priority task and sleeps. This priority inversion prevents * the rcu-booster from working as expected. */ complete(&rnp->boost_completion); } Just revert the patch to avoid it. Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 883ebc8..9535647 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,11 +172,6 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ - struct completion boost_completion; - /* Used to ensure that the rt_mutex used */ - /* to carry out the boosting is fully */ - /* released with no future boostee accesses */ - /* before that rt_mutex is re-initialized. */ struct rt_mutex boost_mtx; /* Used only for the priority-boosting */ /* side effect, not as a lock. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1fac682..625e260 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -429,10 +429,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1081,15 +1079,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } -- cgit v0.10.2 From 6cd534ef8bfe422a5a847b953d1039841509c374 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Dec 2014 07:45:13 -0800 Subject: rcu: Don't scan root rcu_node structure for stalled tasks Now that blocked tasks are no longer migrated to the root rcu_node structure, there is no need to scan the root rcu_node structure for blocked tasks stalling the current grace period. This commit therefore removes this scan. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 24d3c67..4ed2c28 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1107,15 +1107,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining.
- */ - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; -- cgit v0.10.2 From ab954c167ed9ac107a8beb1d6e1745ae698a3421 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 18:25:59 -0800 Subject: rcu: Remove redundant callback-list initialization The RCU callback lists are initialized in both rcu_boot_init_percpu_data() and rcu_init_percpu_data(). The former is intended for initializing immutable data, so this commit removes the initialization from rcu_boot_init_percpu_data() and leaves it in rcu_init_percpu_data(). This change prepares for permitting callbacks to be queued very early in boot. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4ed2c28..febd0f7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3419,9 +3419,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - init_callback_list(rdp); - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -- cgit v0.10.2 From a5c198f4f7da6cc48116ca239c59c9f44b753364 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 23 Nov 2014 20:30:06 -0800 Subject: rcu: Expand SRCU ->completed to 64 bits When rcutorture used only the low-order 32 bits of the grace-period number, it was not a problem for SRCU to use a 32-bit completed field. However, rcutorture now uses the full 64 bits on 64-bit systems, so this commit converts SRCU's ->completed field to unsigned long so as to provide 64 bits on 64-bit systems. Signed-off-by: Paul E. McKenney diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a2783cb..ef923dd 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -45,7 +45,7 @@ struct rcu_batch { #define RCU_BATCH_INIT(name) { NULL, &(name.head) } struct srcu_struct { - unsigned completed; + unsigned long completed; struct srcu_struct_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; @@ -135,7 +135,7 @@ int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); void synchronize_srcu_expedited(struct srcu_struct *sp); -long srcu_batches_completed(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3e..445bf8f 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } -- cgit v0.10.2 From 9735af5c78599703be633c057af3faee26482028 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 26 Nov 2014 10:42:50 -0800 Subject: rcu: Combine DEFINE_SRCU() and DEFINE_STATIC_SRCU() The DEFINE_SRCU() and DEFINE_STATIC_SRCU() definitions are quite similar, so this commit combines them, saving a bit of code and removing redundancy. Signed-off-by: Paul E. McKenney diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ef923dd..9cfd962 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,13 +102,11 @@ void process_srcu(struct work_struct *work); * define and init a srcu struct at build time. * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. */ -#define DEFINE_SRCU(name) \ +#define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - struct srcu_struct name = __SRCU_STRUCT_INIT(name); - -#define DEFINE_STATIC_SRCU(name) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name); + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) /** * call_srcu() - Queue a callback for invocation after an SRCU grace period -- cgit v0.10.2 From 83fe27ea531161a655f02dc7732d14cfaa27fd5d Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Fri, 5 Dec 2014 11:24:45 -0500 Subject: rcu: Make SRCU optional by using CONFIG_SRCU SRCU need not be compiled in all cases; for kernel-tinification efforts, it is desirable not to compile SRCU unless it is actually needed. This patch makes compiling SRCU optional by introducing a new Kconfig option, CONFIG_SRCU, which is selected when any of the components making use of SRCU are selected. If CONFIG_SRCU is not selected, srcu.o will not be compiled at all:

text data bss dec hex filename
2007 0 0 2007 7d7 kernel/rcu/srcu.o

The size of arch/powerpc/boot/zImage changes from

text data bss dec hex filename
831552 64180 23944 919676 e087c arch/powerpc/boot/zImage : before
829504 64180 23952 917636 e0084 arch/powerpc/boot/zImage : after

so the savings are about 2000 bytes. Signed-off-by: Pranith Kumar CC: Paul E. McKenney CC: Josh Triplett CC: Lai Jiangshan Signed-off-by: Paul E. McKenney [ paulmck: resolve conflict due to removal of arch/ia64/kvm/Kconfig. ] diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..3afee5f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_ARM_HOST + select SRCU depends on ARM_VIRT_EXT && ARM_LPAE ---help--- Support hosting virtualized guest machines. You will also diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ba85e9..b334084 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select KVM_ARM_HOST select KVM_ARM_VGIC select KVM_ARM_TIMER + select SRCU ---help--- Support hosting virtualized guest machines. diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 30e334e..2ae1282 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO + select SRCU ---help--- Support for hosting Guest kernels. Currently supported on MIPS32 processors.
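
As a reminder of what a component "making use of SRCU" looks like, here is a minimal, hypothetical sketch; the my_srcu domain and its caller are illustrative only and not taken from any of the subsystems in this patch:

	DEFINE_STATIC_SRCU(my_srcu);	/* via the combined __DEFINE_SRCU() above */

	int read_something(void)
	{
		int idx = srcu_read_lock(&my_srcu);
		/* ... sleepable read-side critical section ... */
		srcu_read_unlock(&my_srcu, idx);
		return 0;
	}
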
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index f5769f1..11850f3 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_EVENTFD + select SRCU config KVM_BOOK3S_HANDLER bool diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 646db9c..5fce52c 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING + select SRCU ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig index 2298cb1..1e968f7 100644 --- a/arch/tile/kvm/Kconfig +++ b/arch/tile/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM depends on HAVE_KVM && MODULES select PREEMPT_NOTIFIERS select ANON_INODES + select SRCU ---help--- Support hosting paravirtualized guest machines. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bd..6612699 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -138,6 +138,7 @@ config X86 select HAVE_ACPI_APEI_NMI if ACPI select ACPI_LEGACY_TABLES_LOOKUP if ACPI select X86_FEATURE_NAMES if PROC_FS + select SRCU config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f9d16ff..7dc7ba5 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -40,6 +40,7 @@ config KVM select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_VFIO + select SRCU ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index 3f44f29..91f8613 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -13,6 +13,7 @@ config COMMON_CLK bool select HAVE_CLK_PREPARE select CLKDEV_LOOKUP + select SRCU ---help--- The common clock framework is a single definition of struct clk, useful across many platforms, as well as an diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 29b2ef5..a171fef 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -2,6 +2,7 @@ menu "CPU Frequency scaling" config CPU_FREQ bool "CPU Frequency scaling" + select SRCU help CPU Frequency scaling allows you to change the clock speed of CPUs on the fly. This is a nice method to save power, because diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index faf4e70..3891f67 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -1,5 +1,6 @@ menuconfig PM_DEVFREQ bool "Generic Dynamic Voltage and Frequency Scaling (DVFS) support" + select SRCU help A device may have a list of frequencies and voltages available. devfreq, a generic DVFS framework can be registered for a device diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 5bdedf6..c355a22 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -5,6 +5,7 @@ menuconfig MD bool "Multiple devices driver support (RAID and LVM)" depends on BLOCK + select SRCU help Support multiple physical spindles through a single logical device. Required for RAID and logical volume management. 
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index d6607ee..84673eb 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -197,6 +197,7 @@ config NETCONSOLE_DYNAMIC config NETPOLL def_bool NETCONSOLE + select SRCU config NET_POLL_CONTROLLER def_bool NETPOLL diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a66768e..80e9c18 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -8,6 +8,7 @@ config BTRFS_FS select LZO_DECOMPRESS select RAID6_PQ select XOR_BLOCKS + select SRCU help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 22c629e..2a24249 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,5 +1,6 @@ config FSNOTIFY def_bool n + select SRCU source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index c51df1d..4a09975 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -5,6 +5,7 @@ config QUOTA bool "Quota support" select QUOTACTL + select SRCU help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the diff --git a/init/Kconfig b/init/Kconfig index 9afb971..f085969 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -501,9 +501,17 @@ config TINY_RCU endchoice +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + config TASKS_RCU bool "Task_based RCU implementation using voluntary context switch" default n + select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and @@ -1595,6 +1603,7 @@ config PERF_EVENTS depends on HAVE_PERF_EVENTS select ANON_INODES select IRQ_WORK + select SRCU help Enable kernel support for various performance events provided by software and hardware. diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6..ae9fc7c 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +#ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); +#endif /* CONFIG_SRCU */ + static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 48b28d3..7e01f78 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -251,6 +251,7 @@ config APM_EMULATION config PM_OPP bool + select SRCU ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. 
This diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index e6fae50..50a8084 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,5 @@ -obj-y += update.o srcu.o +obj-y += update.o +obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5f2ce61..7a9c93e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1215,6 +1215,7 @@ config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST + select SRCU default n help This option provides a kernel module that runs torture tests diff --git a/mm/Kconfig b/mm/Kconfig index 1d1ae6b..4395b12 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -325,6 +325,7 @@ config VIRT_TO_BUS config MMU_NOTIFIER bool + select SRCU config KSM bool "Enable KSM for page merging" diff --git a/security/tomoyo/Kconfig b/security/tomoyo/Kconfig index 8eb779b9d..604e718 100644 --- a/security/tomoyo/Kconfig +++ b/security/tomoyo/Kconfig @@ -5,6 +5,7 @@ config SECURITY_TOMOYO select SECURITYFS select SECURITY_PATH select SECURITY_NETWORK + select SRCU default n help This selects TOMOYO Linux, pathname-based access control. -- cgit v0.10.2 From 68158fe2b2aa0f0c1cf96b40020c7d180915479b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 07:41:31 -0800 Subject: rcu: Set default to RCU_CPU_STALL_INFO=y The RCU_CPU_STALL_INFO code has been in for quite some time, and has proven reliable. This commit therefore enables it by default. Signed-off-by: Paul E. McKenney diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5f2ce61..b47d9e4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1257,7 +1257,7 @@ config RCU_CPU_STALL_TIMEOUT config RCU_CPU_STALL_INFO bool "Print additional diagnostics on RCU CPU stall" depends on (TREE_RCU || PREEMPT_RCU) && DEBUG_KERNEL - default n + default y help For each stalled CPU that is aware of the current RCU grace period, print out additional per-CPU diagnostic information -- cgit v0.10.2 From e9408e4f278875aa5862560b4b425b27dfcb643f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 09:13:52 -0800 Subject: rcutorture: Add checks for stall ending before dump start The current rcutorture scripting checks for actual stalls (via the "Call Trace:" check), but fails to spot the case where a stall ends just as it is being detected. This commit therefore adds a check for this case. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index f962ba4..d8f35cf 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -36,7 +36,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T if test -s $T then print_warning Assertion failure in $file $title -- cgit v0.10.2 From fc908ed33e7c1428f799abb12399f906da03b397 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 8 Dec 2014 09:57:48 -0800 Subject: rcu: Make RCU_CPU_STALL_INFO include number of fqs attempts One way that an RCU CPU stall warning can happen is if the grace-period kthread is not allowed to execute. One proxy for this kthread's forward progress is the number of force-quiescent-state (fqs) scans. This commit therefore adds the number of fqs scans to the RCU CPU stall warning printouts when CONFIG_RCU_CPU_STALL_INFO=y. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fc..654b15b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1043,6 +1043,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) j1 = rcu_jiffies_till_stall_check(); ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; + rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b184..e300848 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -492,6 +492,8 @@ struct rcu_state { /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ + unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ + /* GP start. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb..769384d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1898,11 +1898,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } -- cgit v0.10.2 From 6ccd2ecd422644277b7d8b37222e3af3f43ea9ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Dec 2014 10:20:59 -0800 Subject: rcu: Improve diagnostics for spurious RCU CPU stall warnings The current RCU CPU stall warning code will print "Stall ended before state dump start" any time the stall-warning code is triggered on a CPU that has already reported a quiescent state for the current grace period, provided that all quiescent states have been reported for that grace period. However, a true stall can result in these symptoms, for example, by preventing RCU's grace-period kthreads from ever running. This commit therefore checks for this condition, reporting the end of the stall only if one of the grace-period counters has actually advanced. Otherwise, it reports the last time that the grace-period kthread made meaningful progress. (In normal situations, the grace-period kthread should make meaningful progress at least every jiffies_till_next_fqs jiffies.) Reported-by: Miroslav Benes Signed-off-by: Paul E. McKenney Tested-by: Miroslav Benes diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index ed186a9..55f9707 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -187,6 +187,11 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the behavior, you might need to replace some of the cond_resched() calls with calls to cond_resched_rcu_qs().
+o Anything that prevents RCU's grace-period kthreads from running. + This can result in the "All QSes seen" console-log message. + This message will include information on when the kthread last + ran and how often it should be expected to run. + o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 654b15b..a2ceb66 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1066,11 +1066,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } -static void print_other_cpu_stall(struct rcu_state *rsp) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; long delta; unsigned long flags; + unsigned long gpa; + unsigned long j; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1123,10 +1125,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else + if (ndetected) { rcu_dump_cpu_stacks(rsp); + } else { + if (ACCESS_ONCE(rsp->gpnum) != gpnum || + ACCESS_ONCE(rsp->completed) == gpnum) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + rsp->name, j - gpa, j, gpa, + jiffies_till_next_fqs); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } /* Complain about tasks blocking the grace period. */ @@ -1226,7 +1240,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); + print_other_cpu_stall(rsp, gpnum); } } @@ -1622,6 +1636,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); @@ -1682,6 +1697,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } mutex_unlock(&rsp->onoff_mutex); @@ -1698,6 +1714,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. 
*/ @@ -1736,6 +1753,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1772,6 +1790,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1821,6 +1840,7 @@ static int __noreturn rcu_gp_kthread(void *arg) if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1864,9 +1884,11 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e300848..5ec81cf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -488,6 +488,8 @@ struct rcu_state { /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_activity; /* Time of last GP kthread */ + /* activity in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ -- cgit v0.10.2 From e3663b1024d1f94688e5233440ad67a9bc10b94e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 20:26:55 -0800 Subject: rcu: Handle gpnum/completed wrap while dyntick idle Subtle race conditions can result if a CPU stays in dyntick-idle mode long enough for the ->gpnum and ->completed fields to wrap. For example, consider the following sequence of events:

o CPU 1 encounters a quiescent state while waiting for grace period 5 to complete, but then enters dyntick-idle mode.

o While CPU 1 is in dyntick-idle mode, the grace-period counters wrap around so that the grace period number is now 4.

o Just as CPU 1 exits dyntick-idle mode, grace period 4 completes and grace period 5 begins.

o The quiescent state that CPU 1 passed through during the old grace period 5 looks like it applies to the new grace period 5. Therefore, the new grace period 5 completes without CPU 1 having passed through a quiescent state.

This could clearly be a fatal surprise to any long-running RCU read-side critical section that happened to be running on CPU 1 at the time. At one time, this was not a problem, given that it takes significant time for the grace-period counters to overflow even on 32-bit systems. However, with the advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long idle periods are now becoming quite feasible. It is therefore time to close this race. This commit therefore avoids this race condition by having the quiescent-state forcing code detect when a CPU is falling too far behind, and setting a new rcu_data field ->gpwrap when this happens. Whenever this new ->gpwrap field is set, the CPU's ->gpnum and ->completed fields are known to be untrustworthy, and can be ignored, along with any associated quiescent states. Signed-off-by: Paul E. McKenney
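
To make the wrap detection concrete, here is a minimal sketch of the check added below, assuming ULONG_CMP_LT() as defined in rcupdate.h:

	/* Wraparound-safe "(a) < (b)" for free-running unsigned counters. */
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

	/* Sketch: flag a CPU whose grace-period snapshot is more than a
	 * quarter of the counter space behind its rcu_node's current value. */
	if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
			 rdp->mynode->gpnum))
		ACCESS_ONCE(rdp->gpwrap) = true; /* ->gpnum/->completed now suspect. */
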
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2ceb66..5987fdc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -930,6 +930,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { + if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + rdp->mynode->gpnum)) + ACCESS_ONCE(rdp->gpwrap) = true; return 0; } } @@ -1577,7 +1580,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { + if (rdp->completed == rnp->completed && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. */ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1592,7 +1596,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum) { + if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1603,6 +1607,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->passed_quiesce = 0; rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); + ACCESS_ONCE(rdp->gpwrap) = false; } return ret; } @@ -1616,7 +1621,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + rdp->completed == ACCESS_ONCE(rnp->completed) && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; @@ -2066,7 +2072,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -3190,7 +3196,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5ec81cf..7472ff3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -260,6 +260,7 @@ struct rcu_data { bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 769384d7..81ff8b9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1605,7 +1605,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke.
*/ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); -- cgit v0.10.2 From 79619cf5151257f82a4c016b02283e2ceb4b63ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Nov 2014 10:12:27 -0800 Subject: rcutorture: Issue warnings on close calls due to Reader Batch blows Normal rcutorture checking overestimates grace periods somewhat due to the fact that there is a delay from a grace-period request until the start of the corresponding grace period and another delay from the end of that grace period to notification of the requestor. This means that rcutorture's detection of RCU bugs is less sensitive than it might be. It turns out that rcutorture also checks the underlying grace-period "completed" counter (displayed in Reader Batch output), which in theory allows rcutorture to do exact checks. In practice, memory misordering (by both compiler and CPU) can result in false positives. However, experience on x86 shows that these false positives are quite rare, occurring less than one time per 1,000 hours of testing. This commit therefore does the exact checking, giving a warning if any Reader Batch blows happen, and flagging an error if they happen more often than once every three hours in long tests. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index d6cc07f..559e01a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -30,6 +30,7 @@ else echo Unreadable results directory: $i exit 1 fi +. tools/testing/selftests/rcutorture/bin/functions.sh configfile=`echo $i | sed -e 's/^.*\///'` ngps=`grep ver: $i/console.log 2> /dev/null | tail -1 | sed -e 's/^.* ver: //' -e 's/ .*$//'` @@ -48,4 +49,21 @@ else title="$title ($ngpsps per second)" fi echo $title + nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'` + if test -z "$nclosecalls" + then + exit 0 + fi + if test "$nclosecalls" -eq 0 + then + exit 0 + fi + # Compute number of close calls per tenth of an hour + nclosecalls10=`awk -v nclosecalls=$nclosecalls -v dur=$dur 'BEGIN { print int(nclosecalls * 36000 / dur) }' < /dev/null` + if test $nclosecalls10 -gt 5 -a $nclosecalls -gt 1 + then + print_bug $nclosecalls "Reader Batch close calls in" $(($dur/60)) minute run: $i + else + print_warning $nclosecalls "Reader Batch close calls in" $(($dur/60)) minute run: $i + fi fi -- cgit v0.10.2 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. McKenney
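
The point of the unsigned return type is that counter differences remain well defined across wrap, whereas signed overflow is undefined behavior in C. A minimal illustration:

	unsigned long started   = ULONG_MAX - 1;  /* counter just before wrapping... */
	unsigned long completed = 2;              /* ...and shortly after it wraps. */
	unsigned long ngps = completed - started; /* == 4, well defined for unsigned;
						     the signed version would be UB. */
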
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e53662..91f7e4c 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -94,7 +94,7 @@ static inline void rcu_virt_note_context_switch(int cpu) /* * Return the number of grace periods. */ -static inline long rcu_batches_completed(void) +static inline unsigned long rcu_batches_completed(void) { return 0; } @@ -102,7 +102,7 @@ static inline long rcu_batches_completed(void) /* * Return the number of bottom-half grace periods. */ -static inline long rcu_batches_completed_bh(void) +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5295379..9885bfb 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,9 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -long rcu_batches_completed(void); -long rcu_batches_completed_bh(void); -long rcu_batches_completed_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fc..e26d787 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -317,7 +317,7 @@ static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* * Return the number of RCU BH batches processed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b184..1a07d73 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); +unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb..f69300d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v0.10.2 From 3b009c0ebe02a06d2d5bd667da0eb9cefa79321d Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Fri, 21 Nov 2014 13:54:57 -0800 Subject: rcutorture: Make build-output parsing correctly flag RCU's warnings Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-build.sh b/tools/testing/selftests/rcutorture/bin/parse-build.sh index 499d1e5..a6b5762 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-build.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-build.sh @@ -26,12 +26,15 @@ # # Authors: Paul E. McKenney -T=$1 +F=$1 title=$2 +T=/tmp/parse-build.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T . functions.sh -if grep -q CC < $T +if grep -q CC < $F then : else @@ -39,18 +42,21 @@ else exit 1 fi -if grep -q "error:" < $T +if grep -q "error:" < $F then print_bug $title build errors: - grep "error:" < $T + grep "error:" < $F exit 2 fi -exit 0 -if egrep -q "rcu[^/]*\.c.*warning:|rcu.*\.h.*warning:" < $T +grep warning: < $F > $T/warnings +grep "include/linux/*rcu*\.h:" $T/warnings > $T/hwarnings +grep "kernel/rcu/[^/]*:" $T/warnings > $T/cwarnings +cat $T/hwarnings $T/cwarnings > $T/rcuwarnings +if test -s $T/rcuwarnings then print_warning $title build errors: - egrep "rcu[^/]*\.c.*warning:|rcu.*\.h.*warning:" < $T + cat $T/rcuwarnings exit 2 fi exit 0 -- cgit v0.10.2 From 6b80da42c02bc731ab38b6a37da5366abe26717f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 14:19:26 -0800 Subject: rcutorture: Use unsigned for Reader Batch computations The counter returned by the various ->completed functions is subject to overflow, which means that subtracting two such counters might result in overflow, which invokes undefined behavior in the C standard. This commit therefore changes these functions and variables to unsigned to avoid this undefined behavior. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4d559ba..f43e351 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,7 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - int (*completed)(void); + unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -296,7 +296,7 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static int rcu_torture_completed(void) +static unsigned long rcu_torture_completed(void) { return rcu_batches_completed(); } @@ -356,7 +356,7 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } -static int rcu_no_completed(void) +static unsigned long rcu_no_completed(void) { return 0; } @@ -407,7 +407,7 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static int rcu_bh_torture_completed(void) +static unsigned long rcu_bh_torture_completed(void) { return rcu_batches_completed_bh(); } @@ -510,7 +510,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) srcu_read_unlock(&srcu_ctl, idx); } -static int srcu_torture_completed(void) +static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(&srcu_ctl); } @@ -1015,8 +1015,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; - int completed; - int completed_end; + unsigned long completed; + unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1073,8 +1073,8 @@ static void rcu_torture_timer(unsigned long unused) static int 
rcu_torture_reader(void *arg) { - int completed; - int completed_end; + unsigned long completed; + unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; -- cgit v0.10.2 From c1fe9cde4ae904fffb5b4d975d0a37e99136ff50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 15:45:27 -0800 Subject: rcu: Provide rcu_batches_completed_sched() for TINY_RCU A bug in rcutorture has caused it to ignore completed batches. In preparation for fixing that bug, this commit provides TINY_RCU with the required rcu_batches_completed_sched(). Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 91f7e4c..1ce2d6b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -107,6 +107,14 @@ static inline unsigned long rcu_batches_completed_bh(void) return 0; } +/* + * Return the number of sched grace periods. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } -- cgit v0.10.2 From 1e32eaee4c370afca20324f634bf47ae991cd240 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 16:04:34 -0800 Subject: rcutorture: Drop rcu_torture_completed() and friends Now that the return type of rcu_batches_completed() and friends matches that of the rcu_torture_ops structure's ->completed field, the wrapper functions can be deleted. This commit carries out that deletion, while also wiring "sched"'s ->completed field to rcu_batches_completed_sched(). Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f43e351..aadbc07 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -296,11 +296,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static unsigned long rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -377,7 +372,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, + .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -407,11 +402,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static unsigned long rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - static void rcu_bh_torture_deferred_free(struct rcu_torture *p) { call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); @@ -423,7 +413,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, + .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -600,7 +590,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, + .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, -- cgit v0.10.2 From f9103c390257d06c162d9e3c2a90d2bdedadfe17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 16:48:20 -0800 Subject: rcu: Remove redundant rcu_batches_completed() declaration Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 1a07d73..69eb422 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -- cgit v0.10.2 From 917963d0b30f9c4153c372c165178501d97b6b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 17:10:16 -0800 Subject: rcutorture: Check from beginning to end of grace period Currently, rcutorture's Reader Batch checks measure from the end of the previous grace period to the end of the current one. This commit tightens up these checks by measuring from the start and end of the same grace period. This involves adding rcu_batches_started() and friends corresponding to the existing rcu_batches_completed() and friends. We leave SRCU alone for the moment, as it does not yet have a way of tracking both ends of its grace periods. Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1ce2d6b..9841921 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,7 +92,31 @@ static inline void rcu_virt_note_context_switch(int cpu) } /* - * Return the number of grace periods. + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. */ static inline unsigned long rcu_batches_completed(void) { @@ -100,7 +124,7 @@ static inline unsigned long rcu_batches_completed(void) } /* - * Return the number of bottom-half grace periods. + * Return the number of bottom-half grace periods completed. */ static inline unsigned long rcu_batches_completed_bh(void) { @@ -108,7 +132,7 @@ static inline unsigned long rcu_batches_completed_bh(void) } /* - * Return the number of sched grace periods. + * Return the number of sched grace periods completed. 
*/ static inline unsigned long rcu_batches_completed_sched(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9885bfb..c0dd124 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,6 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aadbc07..24142c2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,6 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); + unsigned long (*started)(void); unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); @@ -372,6 +373,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, + .started = rcu_batches_started, .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, @@ -413,6 +415,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, + .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, @@ -456,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -554,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -590,6 +595,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, + .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, @@ -628,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, @@ -1005,8 +1012,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; + unsigned long started; unsigned long completed; - unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1014,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1037,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1063,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { + unsigned long started; unsigned long completed; - unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -1083,7 +1095,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1104,14 +1119,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e26d787..c0faad5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -315,7 +315,43 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. 
+ */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); + +/* + * Return the number of RCU BH batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_sched(void) { @@ -324,7 +360,7 @@ unsigned long rcu_batches_completed_sched(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_bh(void) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f69300d..07e61a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -114,25 +114,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static unsigned long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked @@ -933,15 +914,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -- cgit v0.10.2 From 94162c8daa839bd0de791f66d754a880e2787c82 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Dec 2014 17:42:16 -0800 Subject: rcutorture: Handle different mpstat versions The mpstat command recently added the %gnice column, which messes up the cpus2use.sh script's idle-CPU calculations. This commit therefore uses $NF instead of $12 to select the last (%idle) column. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh index abe14b7..bb99cde 100755 --- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh +++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh @@ -24,7 +24,7 @@ ncpus=`grep '^processor' /proc/cpuinfo | wc -l` idlecpus=`mpstat | tail -1 | \ - awk -v ncpus=$ncpus '{ print ncpus * ($7 + $12) / 100 }'` + awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null ' BEGIN { cpus2use = idlecpus; -- cgit v0.10.2
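To make the new started/completed pairing above concrete, here is a minimal standalone sketch (userspace C, not kernel code; struct gp_counters and reader_gps() are invented names for illustration): snapshot ->started() when entering the read-side critical section, snapshot ->completed() when leaving it, and count the grace periods seen in between, adding one when a started-side snapshot is available, mirroring the adjustment in the rcutorture hunks above.

#include <stdio.h>

/*
 * Invented stand-in for an RCU flavor's grace-period counters:
 * gpnum advances when a grace period starts, completed when it ends,
 * so gpnum == completed + 1 while a grace period is in progress.
 */
struct gp_counters {
	unsigned long gpnum;		/* grace periods started */
	unsigned long completed;	/* grace periods completed */
};

/*
 * Grace periods observed by a reader, following the arithmetic patched
 * into rcu_torture_timer() and rcu_torture_reader() above.
 */
static unsigned long reader_gps(unsigned long started, unsigned long completed,
				int have_started_op)
{
	unsigned long gps = completed - started;

	if (have_started_op)	/* started-side snapshot may be mid-GP */
		gps++;
	return gps;
}

int main(void)
{
	struct gp_counters c = { .gpnum = 5, .completed = 4 };	/* GP #5 in flight */
	unsigned long started = c.gpnum;	/* what cur_ops->started() would return */

	c.gpnum = 7;		/* two more grace periods start... */
	c.completed = 7;	/* ...and everything through #7 completes */
	printf("grace periods seen: %lu\n",
	       reader_gps(started, c.completed, 1));	/* prints 3 */
	return 0;
}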
McKenney" Date: Wed, 10 Dec 2014 17:43:58 -0800 Subject: torture: Add "-enable-kvm -soundhw pcspk" to qemu command line More recent qemu implementations really want "-enable-kvm", and the "-soundhw pcspk" makes the script a bit less dependent on odd audio libraries being installed. This commit therefore adds both to the default qemu command line. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 8ca9f21..35b8532 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -8,9 +8,9 @@ # # Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args # -# qemu-args defaults to "-nographic", along with arguments specifying the -# number of CPUs and other options generated from -# the underlying CPU architecture. +# qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with +# arguments specifying the number of CPUs and other +# options generated from the underlying CPU architecture. # boot_args defaults to value returned by the per_version_boot_params # shell function. # @@ -138,7 +138,7 @@ then fi # Generate -smp qemu argument. -qemu_args="-nographic $qemu_args" +qemu_args="-enable-kvm -soundhw pcspk -nographic $qemu_args" cpu_count=`configNR_CPUS.sh $config_template` cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` vcpus=`identify_qemu_vcpus` -- cgit v0.10.2 From d2f74b5b488dd2b283977f64844fee3ba91676db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Dec 2014 10:50:22 -0800 Subject: torture: Flag console.log file to prevent holdovers from earlier runs A system misconfiguration that prevents qemu from running at all (for example, a missing dynamically linked library) will keep the console.log file from the previous run. This can fool the developer into thinking that this failed run actually completed correctly. This commit therefore overwrites the console.log file just before launching qemu. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 35b8532..5236e07 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -168,6 +168,7 @@ then touch $resdir/buildonly exit 0 fi +echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & qemu_pid=$! -- cgit v0.10.2 From 7602de4af192b1527767538454557cb7564bae60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 18:39:54 -0800 Subject: rcutorture: Add more diagnostics in rcu_barrier() test failure case Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 24142c2..30d42aa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1427,6 +1427,9 @@ static int rcu_torture_barrier(void *arg) cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; + pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", + atomic_read(&barrier_cbs_invoked), + n_barrier_cbs); WARN_ON_ONCE(1); } n_barrier_successes++; -- cgit v0.10.2 From 28423ad283d5348793b0c45cc9b1af058e776fd6 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Tue, 13 Jan 2015 13:16:18 -0800 Subject: ksoftirqd: Enable IRQs and call cond_resched() before poking RCU While debugging an issue with excessive softirq usage, I encountered the following note in commit 3e339b5dae24a706 ("softirq: Use hotplug thread infrastructure"): [ paulmck: Call rcu_note_context_switch() with interrupts enabled. ] ...but despite this note, the patch still calls RCU with IRQs disabled. This seemingly innocuous change caused a significant regression in softirq CPU usage on the sending side of a large TCP transfer (~1 GB/s): when introducing 0.01% packet loss, the softirq usage would jump to around 25%, spiking as high as 50%. Before the change, the usage would never exceed 5%. Moving the call to rcu_note_context_switch() after the cond_resched() call, as it was originally before the hotplug patch, completely eliminated this problem. Signed-off-by: Calvin Owens Cc: stable@vger.kernel.org Signed-off-by: Paul E. McKenney diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9..c497fcd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -656,9 +656,13 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(); local_irq_enable(); cond_resched(); + + preempt_disable(); + rcu_note_context_switch(); + preempt_enable(); + return; } local_irq_enable(); -- cgit v0.10.2 From 60479676eb6e9913176d93ebad194e3dc167bc63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Jan 2015 13:20:26 -0800 Subject: ksoftirqd: Use new cond_resched_rcu_qs() function Simplify run_ksoftirqd() by using the new cond_resched_rcu_qs() function that conditionally reschedules, but unconditionally supplies an RCU quiescent state. This commit is separate from the previous commit by Calvin Owens because Calvin's approach can be backported, while this commit cannot be. The reason that this commit cannot be backported is that cond_resched_rcu_qs() does not always provide the needed quiescent state in earlier kernels. Signed-off-by: Paul E. McKenney diff --git a/kernel/softirq.c b/kernel/softirq.c index c497fcd..8cdb988 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -657,12 +657,7 @@ static void run_ksoftirqd(unsigned int cpu) */ __do_softirq(); local_irq_enable(); - cond_resched(); - - preempt_disable(); - rcu_note_context_switch(); - preempt_enable(); - + cond_resched_rcu_qs(); return; } local_irq_enable(); -- cgit v0.10.2
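Taken together, the two ksoftirqd patches above converge on a simple rule for long-running kernel loops: on each pass, yield the CPU if needed and report an RCU quiescent state, and never report that state with interrupts disabled. The following kernel-style sketch shows the resulting pattern; it is not a buildable module, and do_pending_work() is a hypothetical stand-in for __do_softirq(), while local_irq_disable()/local_irq_enable() and cond_resched_rcu_qs() are the real kernel APIs used above.

static void long_running_loop(void)	/* hypothetical example, not from the patches */
{
	for (;;) {
		local_irq_disable();
		do_pending_work();	/* stand-in for __do_softirq() */
		local_irq_enable();

		/*
		 * Reschedule if needed and, either way, record an RCU
		 * quiescent state with interrupts enabled, which is the
		 * combination the two patches above arrive at.
		 */
		cond_resched_rcu_qs();
	}
}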
From a94844b22a2e2b9155bbc0878c507850477221c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Dec 2014 07:37:48 -0800 Subject: rcu: Optionally run grace-period kthreads at real-time priority Recent testing has shown that under heavy load, running RCU's grace-period kthreads at real-time priority can improve performance (according to 0day test robot) and reduce the incidence of RCU CPU stall warnings. However, most systems do just fine with the default non-realtime priorities for these kthreads, and it does not make sense to expose the entire user base to any risk stemming from this change, given that this change is of use only to a few users running extremely heavy workloads. Therefore, this commit allows users to specify realtime priorities for the grace-period kthreads, but leaves them running SCHED_OTHER by default. The realtime priority may be specified at build time via the RCU_KTHREAD_PRIO Kconfig parameter, or at boot time via the rcutree.kthread_prio parameter. Either way, 0 says to continue the default SCHED_OTHER behavior and values from 1 to 99 specify the SCHED_FIFO priority to be used. Note that a value of 0 is not permitted when the RCU_BOOST Kconfig parameter is specified. Signed-off-by: Paul E. McKenney diff --git a/init/Kconfig b/init/Kconfig index 9afb971..d3ee66a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -668,9 +668,10 @@ config RCU_BOOST config RCU_KTHREAD_PRIO int "Real-time priority to use for RCU worker threads" - range 1 99 - depends on RCU_BOOST - default 1 + range 1 99 if RCU_BOOST + range 0 99 if !RCU_BOOST + default 1 if RCU_BOOST + default 0 if !RCU_BOOST help This option specifies the SCHED_FIFO priority value that will be assigned to the rcuc/n and rcub/n threads and is also the value diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5987fdc..75ce123 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -3597,17 +3601,35 @@ static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; + int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct rcu_state *rsp; + struct sched_param sp; struct task_struct *t; + /* Force priority into range. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + if (kthread_prio != kthread_prio_in) + pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", + kthread_prio, kthread_prio_in); + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); + t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); } rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81ff8b9..dcb3263 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -34,10 +34,6 @@ #include "../locking/rtmutex_common.h" -/* rcuc/rcub kthread realtime priority */ -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -module_param(kthread_prio, int, 0644); - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. -- cgit v0.10.2 From 5cd37193ce8539be1e6ef76be226f4bcc984e0f5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 13 Dec 2014 20:32:04 -0800 Subject: rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used in places where it would be useful for it to apply to the normal RCU flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the case for workloads that aggressively overload the system, particularly those that generate large numbers of RCU updates on systems running NO_HZ_FULL CPUs. This commit therefore communicates quiescent states from cond_resched_rcu_qs() to the normal RCU flavors. Note that it is unfortunately necessary to leave the old ->passed_quiesce mechanism in place to allow quiescent states that apply to only one flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in that case, but that is not so good for debugging of RCU internals.) In addition, if one of the RCU flavor's grace period has stalled, this will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight quiescent state visible from other CPUs. Reported-by: Sasha Levin Reported-by: Dave Jones Signed-off-by: Paul E. McKenney [ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu() was used in preemptible code. ] diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index b63b9bb..08651da 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -56,14 +56,14 @@ rcuboost: The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 This file has one line per CPU, or eight for this 8-CPU system. 
The fields are as follows: @@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this Kernels compiled with CONFIG_RCU_BOOST=y display the following from /debug/rcu/rcu_preempt/rcudata: - 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... 
kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 This is similar to the output discussed above, but contains the following additional fields: diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f593..4682287 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -331,12 +331,13 @@ static inline void rcu_init_nohz(void) extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ + rcu_all_qs(); \ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e53662..fabd3fa 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -154,7 +154,10 @@ static inline bool rcu_is_watching(void) return true; } - #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_all_qs(void) +{ +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5295379..ddba927 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_all_qs(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75ce123..cb00e03 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -288,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiescent state for all RCU flavors. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desperate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. 
*/ @@ -1609,6 +1628,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); ACCESS_ONCE(rdp->gpwrap) = false; @@ -2075,8 +2095,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2085,6 +2107,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2129,7 +2152,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -3174,9 +3198,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3510,6 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7472ff3..1e7f8b0 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,8 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. 
*/ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e..fbb6240 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, -- cgit v0.10.2 From fb81a44b88e6173ed0f6e9d6a1afa5305fb63f6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Dec 2014 08:35:02 -0800 Subject: rcu: Add GP-kthread-starvation checks to CPU stall warnings This commit adds a message that is printed if the relevant grace-period kthread has not been able to run for the two seconds preceding the stall warning. (The two seconds is double the maximum interval between successive bouts of quiescent-state forcing.) Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 55f9707..53e7d28 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -152,6 +152,15 @@ no non-lazy callbacks ("." is printed otherwise, as shown above) and "D" indicates that dyntick-idle processing is enabled ("." is printed otherwise, for example, if disabled via the "nohz=" kernel boot parameter). +If the relevant grace-period kthread has been unable to run prior to +the stall warning, the following additional line is printed: + + rcu_preempt kthread starved for 2023 jiffies! + +Starving the grace-period kthreads of CPU time can of course result in +RCU CPU stall warnings even when all CPUs and tasks have passed through +the required quiescent states. + Multiple Warnings From One Stall diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb00e03..e335f33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1073,6 +1073,21 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Complain about starvation of grace-period kthread. + */ +static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +{ + unsigned long gpa; + unsigned long j; + + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + if (j - gpa > 2 * HZ) + pr_err("%s kthread starved for %ld jiffies!\n", + rsp->name, j - gpa); +} + +/* * Dump stacks of all tasks running on stalled CPUs. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) @@ -1169,9 +1184,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); + rcu_check_gp_kthread_starvation(rsp); + force_quiescent_state(rsp); /* Kick them all. 
*/ } @@ -1196,6 +1212,9 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); + + rcu_check_gp_kthread_starvation(rsp); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v0.10.2 From ec1fe396ff42e240c9b32111ee53665c5916fe5e Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Mon, 22 Dec 2014 11:10:12 -0800 Subject: rcu: Fix RCU CPU stall detection in tiny implementation The tiny RCU CPU stall detection depends on *rcp->curtail not being NULL. It is, however, a tail pointer and thus NULL by definition. Instead, we should check rcp->rcucblist for the presence of pending callbacks that need to be processed. With this fix, INFO about the stall is printed and jiffies_stall (jiffies at next stall) is correctly updated. Note that the check for pending callbacks is necessary to avoid spurious warnings if there are no pending callbacks. Signed-off-by: Miroslav Benes [ paulmck: Fused identical "if" statements, ported to -rcu. ] Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c565..80f908a 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->ticks_this_gp++; j = jiffies; js = ACCESS_ONCE(rcp->jiffies_stall); - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) + } else if (ULONG_CMP_GE(j, js)) { ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + } } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -- cgit v0.10.2 From 630181c4a915edb0761bb282c567d3abc8ca2f35 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Dec 2014 21:33:14 -0800 Subject: rcu: Initialize tiny RCU stall-warning timeouts at boot The current tiny RCU stall-warning code assumes that the jiffies counter starts at zero; however, it is sometimes initialized to other values, for example, -30,000. This commit therefore changes rcu_init() to invoke reset_cpu_stall_ticks() for both flavors of RCU to initialize the stall-warning times properly at boot. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649..fc0f2c0 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -383,6 +383,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); rcu_early_boot_tests(); } -- cgit v0.10.2
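Both tiny-RCU stall-warning fixes above hinge on jiffies arithmetic: the kernel deliberately boots with jiffies close to the wrap point (the -30,000 mentioned in the last commit log), so stall deadlines must be seeded from the live counter and compared with wrap-safe macros. Here is a standalone sketch (userspace C; the ULONG_CMP_GE() definition matches the kernel's) of why a naive comparison misfires around the wrap:

#include <limits.h>
#include <stdio.h>

/* Same definition as the kernel's wrap-safe comparison macro. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long j = (unsigned long)-30000;	/* jiffies shortly after boot */
	unsigned long js = j + 40000;			/* stall deadline, past the wrap */

	/* Naive comparison claims the deadline has already passed... */
	printf("naive j >= js:       %d\n", j >= js);			/* prints 1: wrong */
	/* ...while the wrap-safe macro correctly says it has not. */
	printf("ULONG_CMP_GE(j, js): %d\n", ULONG_CMP_GE(j, js));	/* prints 0: right */
	return 0;
}

This is also why rcu_init() now invokes reset_cpu_stall_ticks() at boot: seeding jiffies_stall from the live jiffies value keeps the very first ULONG_CMP_GE() check in check_cpu_stall() meaningful.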