From 2921b12362babf5af7d45e7b933637cfdfdf6a2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Apr 2016 05:09:53 -0700 Subject: documentation: Add reference to 2014 RCU API LWN article Reported-by: Jim Roskind Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 111770f..13266cf 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -5,6 +5,7 @@ to start learning about RCU: 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ +5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ What is RCU? -- cgit v0.10.2 From db4855b5a88535746558fb756512090fc1ce5607 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Apr 2016 05:21:56 -0700 Subject: documentation: Add references to 2010 and 2014 Big API Tables Reported-by: Jim Roskind Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 13266cf..2044227 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -5,7 +5,9 @@ to start learning about RCU: 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ + 2010 Big API Table http://lwn.net/Articles/419086/ 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ + 2014 Big API Table http://lwn.net/Articles/609973/ What is RCU? -- cgit v0.10.2 From c79dac758dc59f7461a721aaf65301cde8a448bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Apr 2016 09:22:54 -0700 Subject: documentation: Add RCU_NONIDLE() restrictions to requirements Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index e7e24b3..ece410f 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2391,6 +2391,41 @@ and RCU_NONIDLE() on the other while inspecting idle-loop code. Steven Rostedt supplied _rcuidle event tracing, which is used quite heavily in the idle loop. +However, there are some restrictions on the code placed within +RCU_NONIDLE(): + +
    +
  1. Blocking is prohibited. + In practice, this is not a serious restriction given that idle + tasks are prohibited from blocking to begin with. +
  2. Although nesting RCU_NONIDLE() is permited, they cannot + nest indefinitely deeply. + However, given that they can be nested on the order of a million + deep, even on 32-bit systems, this should not be a serious + restriction. + This nesting limit would probably be reached long after the + compiler OOMed or the stack overflowed. +
  3. Any code path that enters RCU_NONIDLE() must sequence + out of that same RCU_NONIDLE(). + For example, the following is grossly illegal: + +
    +
    + 1     RCU_NONIDLE({
    + 2       do_something();
    + 3       goto bad_idea;  /* BUG!!! */
    + 4       do_something_else();});
    + 5   bad_idea:
    +	
    +
    + +

    + It is just as illegal to transfer control into the middle of + RCU_NONIDLE()'s argument. + Yes, in theory, you could transfer in as long as you also + transferred out, but in practice you could also expect to get sharply + worded review comments. +

It is similarly socially unacceptable to interrupt an -- cgit v0.10.2 From 14ef05754f397d468926187d285b9fee55603e86 Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Mon, 25 Apr 2016 07:36:59 +0100 Subject: Documentation: Fix spelling mistake Signed-off-by: Eric Engestrom Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 0f7fb42..e93d041 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -49,7 +49,7 @@ rcupdate.rcu_task_stall_timeout This boot/sysfs parameter controls the RCU-tasks stall warning interval. A value of zero or less suppresses RCU-tasks stall warnings. A positive value sets the stall-warning interval - in jiffies. An RCU-tasks stall warning starts wtih the line: + in jiffies. An RCU-tasks stall warning starts with the line: INFO: rcu_tasks detected stalls on tasks: -- cgit v0.10.2 From 0d95092ccba1a30b74fa52ff94ec5415e63744a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Apr 2016 05:00:03 -0700 Subject: rcu: Fix outdated rcu_scheduler_active comment The comment header for rcu_scheduler_active states that it is used to optimize synchronize_sched() at early boot. This is incorrect. The synchronize_sched() function instead checks the number of online CPUs. This commit therefore replaces the comment's synchronize_sched() with synchronize_rcu(), which really does use rcu_scheduler_active for this purpose. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4..0fa692f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -130,7 +130,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_rcu() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. -- cgit v0.10.2 From 590d1757b9d177e7fe3707963d0209d6eefbc746 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Apr 2016 08:23:24 -0700 Subject: rcu: Fix outdated hotplug-exclusion comment in rcu_gp_init() In the past, RCU grace-period initialization excluded CPU-hotplug operations, but this is no longer the case. This commit therefore removed an outdated comment in rcu_gp_init() claiming that these are excluded. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0fa692f..6043c14 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1989,8 +1989,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * of the tree within the rsp->node[] array. Note that other CPUs * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. + * leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. -- cgit v0.10.2 From d3acab65f274800dd0901f0816f8bca9f2a8c8ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Mar 2016 09:49:04 +0100 Subject: rcu: Remove some superfluous lines I think you'll find this condition is superfluous, as the whole function is under #ifdef of that same. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6043c14..4aefeaf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4363,9 +4363,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ -- cgit v0.10.2 From 3549c2bc2c4ea8ecfeb9d21cb81cb00c6002b011 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2016 16:35:29 -0700 Subject: rcu: Move expedited code from tree.c to tree_exp.h People have been having some difficulty finding their way around the RCU code. This commit therefore pulls some of the expedited grace-period code from tree.c to a new tree_exp.h file. This commit is strictly code movement, with the exception of a forward declaration that was added for the sync_sched_exp_online_cleanup() function. A subsequent commit will move the remaining expedited grace-period code from tree_plugin.h to tree_exp.h. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4aefeaf..c844b61 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -159,6 +159,7 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -3447,549 +3448,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } -/* Wrapper functions for expedited grace periods. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ - rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ - rcu_seq_end(&rsp->expedited_sequence); - smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ - unsigned long s; - - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - return s; -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ - return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity. Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online. This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ - bool done; - unsigned long flags; - unsigned long mask; - unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); - struct rcu_node *rnp; - struct rcu_node *rnp_up; - - /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) - return; - rsp->ncpus_snap = ncpus; - - /* - * Each pass through the following loop propagates newly onlined - * CPUs for the current rcu_node structure up the rcu_node tree. - */ - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - continue; /* No new CPUs, nothing to do. */ - } - - /* Update this node's mask, track old value for propagation. */ - oldmask = rnp->expmaskinit; - rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* If was already nonzero, nothing to propagate. */ - if (oldmask) - continue; - - /* Propagate the new CPU up the tree. */ - mask = rnp->grpmask; - rnp_up = rnp->parent; - done = false; - while (rnp_up) { - raw_spin_lock_irqsave_rcu_node(rnp_up, flags); - if (rnp_up->expmaskinit) - done = true; - rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); - if (done) - break; - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - } - } -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp; - - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the rcu_state's exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return rnp->exp_tasks == NULL && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. - */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - if (!rnp->expmask) - rcu_initiate_boost(rnp, flags); - else - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ - WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; - } -} - -/* - * Report expedited quiescent state for specified node. This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long mask, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). - */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) -{ - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) -{ - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); - return true; - } - return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods. Returns true - * if some other task completed an expedited grace period that this task - * can piggy-back on, and with no mutex held. Otherwise, returns false - * with the mutex held, indicating that the caller must actually do the - * expedited grace period. - */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - /* Low-contention fastpath. */ - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && - (rnp == rnp_root || - ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && - mutex_trylock(&rsp->exp_mutex)) - goto fastpath; - - /* - * Each pass through the following loop works its way up - * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping - * from CPU to rcu_node structure can be inexact, as it is just - * promoting locality and is not strictly needed for correctness. - */ - for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) - return true; - - /* Work not done, either wait here or go up. */ - spin_lock(&rnp->exp_lock); - if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { - - /* Someone else doing GP, so wait for them. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); - return true; - } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); - } - mutex_lock(&rsp->exp_mutex); -fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { - mutex_unlock(&rsp->exp_mutex); - return true; - } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - return false; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; -retry_ipi: - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ - int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; - unsigned long mask; - int ndetected; - struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - int ret; - - jiffies_stall = rcu_jiffies_till_stall_check(); - jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout( - rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) - return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); - ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - struct rcu_data *rdp; - - if (!(rnp->expmask & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - mask <<= 1; - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, - ".T"[!!rnp->exp_tasks]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(rnp->expmask & mask)) - continue; - dump_cpu_task(cpu); - } - } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; - } -} - -/* - * Wait for the current expedited grace period to complete, and then - * wake up everyone who piggybacked on the just-completed expedited - * grace period. Also update all the ->exp_seq_rq counters as needed - * in order to avoid counter-wrap problems. - */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_node *rnp; - - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ - mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); - - rcu_for_each_node_breadth_first(rsp, rnp) { - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { - spin_lock(&rnp->exp_lock); - /* Recheck, avoid hang in case someone just arrived. */ - if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; - spin_unlock(&rnp->exp_lock); - } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); - } - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_wake_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - unsigned long s; - struct rcu_state *rsp = &rcu_sched_state; - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -4747,4 +4205,5 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } +#include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 0000000..db0909c --- /dev/null +++ b/kernel/rcu/tree_exp.h @@ -0,0 +1,564 @@ +/* + * RCU expedited grace periods + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2016 + * + * Authors: Paul E. McKenney + */ + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the rcu_state's exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + swake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the rcu_state's exp_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. + */ +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; + + /* + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. + */ + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; + } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); + } + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = swait_event_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + swait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + ndetected = 0; + rcu_for_each_leaf_node(rsp, rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + + if (!(rnp->expmask & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + mask <<= 1; + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_state *rsp = &rcu_sched_state; + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v0.10.2 From 40e0a6cfd53e37d9b8863cdbc0adb1f72e9311e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2016 16:44:07 -0700 Subject: rcu: Move expedited code from tree_plugin.h to tree_exp.h People have been having some difficulty finding their way around the RCU code. This commit therefore pulls some of the expedited grace-period code from tree_plugin.h to a new tree_exp.h file. This commit is strictly code movement. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index db0909c..00a02a2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -562,3 +562,97 @@ void synchronize_sched_expedited(void) rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. + */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + */ +void synchronize_rcu_expedited(void) +{ + struct rcu_state *rsp = rcu_state_p; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); + + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e..695071d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -681,84 +681,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Remote handler for smp_call_function_single(). If there is an - * RCU read-side critical section in effect, request that the - * next rcu_read_unlock() record the quiescent state up the - * ->expmask fields in the rcu_node tree. Otherwise, immediately - * report the quiescent state. - */ -static void sync_rcu_exp_handler(void *info) -{ - struct rcu_data *rdp; - struct rcu_state *rsp = info; - struct task_struct *t = current; - - /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. - */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; - return; - } - - /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. - */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - */ -void synchronize_rcu_expedited(void) -{ - struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. * @@ -883,16 +805,6 @@ static void rcu_preempt_check_callbacks(void) } /* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. - */ -void synchronize_rcu_expedited(void) -{ - synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ -- cgit v0.10.2 From 810ce8b5df1c8338065f2ae1d2ec08cc566fbb8b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Apr 2016 09:22:15 -0700 Subject: rcu: Document RCU_NONIDLE() restrictions in comment header Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5f1533e..c61b6b9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -379,12 +379,13 @@ static inline void rcu_init_nohz(void) * in the inner idle loop. * * This macro provides the way out: RCU_NONIDLE(do_something_with_RCU()) - * will tell RCU that it needs to pay attending, invoke its argument - * (in this example, a call to the do_something_with_RCU() function), + * will tell RCU that it needs to pay attention, invoke its argument + * (in this example, calling the do_something_with_RCU() function), * and then tell RCU to go back to ignoring this CPU. It is permissible - * to nest RCU_NONIDLE() wrappers, but the nesting level is currently - * quite limited. If deeper nesting is required, it will be necessary - * to adjust DYNTICK_TASK_NESTING_VALUE accordingly. + * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is + * on the order of a million or so, even on 32-bit systems). It is + * not legal to block within RCU_NONIDLE(), nor is it permissible to + * transfer control either into or out of RCU_NONIDLE()'s statement. */ #define RCU_NONIDLE(a) \ do { \ -- cgit v0.10.2 From f8cbdee99b161cb08c3fb55200941028c5fe25c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2016 13:03:18 -0700 Subject: torture: Simplify code, eliminate RCU_PERF_TEST_RUNNABLE This commit applies the infamous IS_ENABLED() macro to eliminate a #ifdef. It also eliminates the RCU_PERF_TEST_RUNNABLE Kconfig option in favor of the already-existing rcuperf.perf_runnable kernel boot parameter. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8..afd174e 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -96,12 +96,7 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) -#define RCUPERF_RUNNABLE_INIT 1 -#else -#define RCUPERF_RUNNABLE_INIT 0 -#endif -static int perf_runnable = RCUPERF_RUNNABLE_INIT; +static int perf_runnable = IS_ENABLED(MODULE); module_param(perf_runnable, int, 0444); MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b9cfdbf..cf6ddcd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1307,22 +1307,6 @@ config RCU_PERF_TEST Say M if you want the RCU performance tests to build as a module. Say N if you are unsure. -config RCU_PERF_TEST_RUNNABLE - bool "performance tests for RCU runnable by default" - depends on RCU_PERF_TEST = y - default n - help - This option provides a way to build the RCU performance tests - directly into the kernel without them starting up at boot time. - You can use /sys/module to manually override this setting. - This /proc file is available only when the RCU performance - tests have been built into the kernel. - - Say Y here if you want the RCU performance tests to start during - boot (you probably don't). - Say N here if you want the RCU performance tests to start only - after being manually enabled via /sys/module. - config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL -- cgit v0.10.2 From 4e9a073f60367157fd64b65490654c39d4c44321 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2016 14:42:34 -0700 Subject: torture: Remove CONFIG_RCU_TORTURE_TEST_RUNNABLE, simplify code This commit removes CONFIG_RCU_TORTURE_TEST_RUNNABLE in favor of the already-existing rcutorture.torture_runnable kernel boot parameter. It also converts an #ifdef into IS_ENABLED(), saving a few lines of code. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a..01cb57f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e..5b2d723 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) - pr_info("\tRCU torture testing starts during boot.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cf6ddcd..805b704 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1324,23 +1324,6 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_TORTURE_TEST_RUNNABLE - bool "torture tests for RCU runnable by default" - depends on RCU_TORTURE_TEST = y - default n - help - This option provides a way to build the RCU torture tests - directly into the kernel without them starting up at boot - time. You can use /proc/sys/kernel/rcutorture_runnable - to manually override this setting. This /proc file is - available only when the RCU torture tests have been built - into the kernel. - - Say Y here if you want the RCU torture tests to start during - boot (you probably don't). - Say N here if you want the RCU torture tests to start only - after being manually enabled via /proc. - config RCU_TORTURE_TEST_SLOW_PREINIT bool "Slow down RCU grace-period pre-initialization to expose races" depends on RCU_TORTURE_TEST -- cgit v0.10.2 From 6e8c66c867f919128855b77623ca9917770f1894 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2016 16:50:44 -0700 Subject: torture: Forgive lengthy trace dumps and preemption This commit avoids killing qemu if a trace dump is making progress or if console log output is continuing and the console log timestamp does not exceed the total plus grace period. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 4109f30..9b17c52 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -229,6 +229,7 @@ fi if test $commandcompleted -eq 0 -a -n "$qemu_pid" then echo Grace period for qemu job at pid $qemu_pid + oldline="`tail $resdir/console.log`" while : do kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` @@ -238,13 +239,29 @@ then else break fi - if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) + must_continue=no + newline="`tail $resdir/console.log`" + if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : ' + then + must_continue=yes + fi + last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" + if test -z "last_ts" + then + last_ts=0 + fi + if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + must_continue=yes + fi + if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) then echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 kill -KILL $qemu_pid break fi - sleep 1 + oldline=$newline + sleep 10 done elif test -z "$qemu_pid" then -- cgit v0.10.2 From d95f5ba90fa6043a366958897fdef705af968b70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Apr 2016 17:18:41 -0700 Subject: torture: Break online and offline functions out of torture_onoff() This commit breaks torture_online() and torture_offline() out of torture_onoff() in preparation for allowing waketorture finer-grained control of its CPU-hotplug workload. Signed-off-by: Paul E. McKenney diff --git a/include/linux/torture.h b/include/linux/torture.h index 7759fc3..6685a73 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -50,6 +50,10 @@ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) /* Definitions for online/offline exerciser. */ +bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_offl, int *min_onl, int *max_onl); +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl); int torture_onoff_init(long ooholdoff, long oointerval); void torture_onoff_stats(void); bool torture_onoff_failures(void); diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdee..fb39a06 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -82,6 +82,104 @@ static int min_online = -1; static int max_online; /* + * Attempt to take a CPU offline. Return false if the CPU is already + * offline or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, + unsigned long *sum_offl, int *min_offl, int *max_offl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_offl_attempts)++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + (*n_offl_successes)++; + delta = jiffies - starttime; + sum_offl += delta; + if (*min_offl < 0) { + *min_offl = delta; + *max_offl = delta; + } + if (*min_offl > delta) + *min_offl = delta; + if (*max_offl < delta) + *max_offl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_offline); + +/* + * Attempt to bring a CPU online. Return false if the CPU is already + * online or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_onl_attempts)++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + (*n_onl_successes)++; + delta = jiffies - starttime; + *sum_onl += delta; + if (*min_onl < 0) { + *min_onl = delta; + *max_onl = delta; + } + if (*min_onl > delta) + *min_onl = delta; + if (*max_onl < delta) + *max_onl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_online); + +/* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ @@ -89,11 +187,8 @@ static int torture_onoff(void *arg) { int cpu; - unsigned long delta; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) @@ -106,67 +201,12 @@ torture_onoff(void *arg) } while (!torture_must_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } + if (!torture_offline(cpu, + &n_offline_attempts, &n_offline_successes, + &sum_offline, &min_offline, &max_offline)) + torture_online(cpu, + &n_online_attempts, &n_online_successes, + &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } torture_kthread_stopping("torture_onoff"); -- cgit v0.10.2 From 682ed706c5bb1526b001bc69aa4ee1e8b456bfa6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Apr 2016 12:03:06 -0700 Subject: torture: Add starvation events to error summary This commit adds a string of the form "Starves: 10" to the summary line for error conditions found in the console output. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 5eb49b7..08aa7d5 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags if test -s $1.diags then print_warning Assertion failure in $file $title @@ -69,6 +69,11 @@ then then summary="$summary Stalls: $n_stalls" fi + n_starves=`grep -c 'rcu_.*kthread starved for' $1` + if test "$n_starves" -ne 0 + then + summary="$summary Starves: $n_starves" + fi print_warning Summary: $summary else rm $1.diags -- cgit v0.10.2 From 750db0f5f7d0ff6b86158015f02c275702639b20 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Mon, 2 May 2016 10:30:00 +0800 Subject: torture: Stop onoff task if there is only one cpu If the whole system has only one cpu, that cpu won't be able to be offlined, so there is no need onoff task is stil running. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney diff --git a/kernel/torture.c b/kernel/torture.c index fb39a06..75961b3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -194,6 +194,12 @@ torture_onoff(void *arg) for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + + if (maxcpu == 0) { + VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); + goto stop; + } + if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff); @@ -209,6 +215,8 @@ torture_onoff(void *arg) &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } + +stop: torture_kthread_stopping("torture_onoff"); return 0; } -- cgit v0.10.2 From e5731b584b3e521e3db6fda9cdfe10646d3413a3 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 19 May 2016 11:42:21 +0800 Subject: rcutorture/doc: Create initrd using dracut Using dracut is another way to get an initramfs for KVM-based RCU torture tests, which is more flexible than using the host's initramfs image, because modules and binaries may be added or removed via dracut command options. So add an example in the document, in case that there are some situations where host's initramfs couldn't be used. Signed-off-by: Boqun Feng Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/doc/initrd.txt b/tools/testing/selftests/rcutorture/doc/initrd.txt index 4170e71..833f826 100644 --- a/tools/testing/selftests/rcutorture/doc/initrd.txt +++ b/tools/testing/selftests/rcutorture/doc/initrd.txt @@ -13,6 +13,22 @@ cd initrd cpio -id < /tmp/initrd.img.zcat ------------------------------------------------------------------------ +Another way to create an initramfs image is using "dracut"[1], which is +available on many distros, however the initramfs dracut generates is a cpio +archive with another cpio archive in it, so an extra step is needed to create +the initrd directory hierarchy. + +Here are the commands to create a initrd directory for rcutorture using +dracut: + +------------------------------------------------------------------------ +dracut --no-hostonly --no-hostonly-cmdline --module "base bash shutdown" /tmp/initramfs.img +cd tools/testing/selftests/rcutorture +mkdir initrd +cd initrd +/usr/lib/dracut/skipcpio /tmp/initramfs.img | zcat | cpio -id < /tmp/initramfs.img +------------------------------------------------------------------------ + Interestingly enough, if you are running rcutorture, you don't really need userspace in many cases. Running without userspace has the advantage of allowing you to test your kernel independently of the @@ -89,3 +105,9 @@ while : do sleep 10 done +------------------------------------------------------------------------ + +References: +[1]: https://dracut.wiki.kernel.org/index.php/Main_Page +[2]: http://blog.elastocloud.org/2015/06/rapid-linux-kernel-devtest-with-qemu.html +[3]: https://www.centos.org/forums/viewtopic.php?t=51621 -- cgit v0.10.2 From 1b900c6a26de26e111617d6b69b64aaee7b9de01 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 19 May 2016 11:42:22 +0800 Subject: rcutorture: Use vmlinux as the fallback kernel image The vmlinux image is available for all the architectures, and suitable for running a KVM guest by QEMU, besides, we used to copy the vmlinux to $resdir anyway. Therefore it makes sense to use it as the fallback kernel image for rcutorture KVM tests. This patch makes identify_boot_image() return vmlinux if ${TORTURE_BOOT_IMAGE} is not set on non-x86 architectures, also fixes several places that hard-code "bzImage" as $KERNEL. This also fixes a problem that PPC doesn't have a bzImage file as build results. Signed-off-by: Boqun Feng Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index b325470..6161801 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -99,8 +99,9 @@ configfrag_hotplug_cpu () { # identify_boot_image qemu-cmd # # Returns the relative path to the kernel build image. This will be -# arch//boot/bzImage unless overridden with the TORTURE_BOOT_IMAGE -# environment variable. +# arch//boot/bzImage or vmlinux if bzImage is not a target for the +# architecture, unless overridden with the TORTURE_BOOT_IMAGE environment +# variable. identify_boot_image () { if test -n "$TORTURE_BOOT_IMAGE" then @@ -110,11 +111,8 @@ identify_boot_image () { qemu-system-x86_64|qemu-system-i386) echo arch/x86/boot/bzImage ;; - qemu-system-ppc64) - echo arch/powerpc/boot/bzImage - ;; *) - echo "" + echo vmlinux ;; esac fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 9b17c52..8dc5e463 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -96,7 +96,8 @@ if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdi then # Rerunning previous test, so use that test's kernel. QEMU="`identify_qemu $base_resdir/vmlinux`" - KERNEL=$base_resdir/bzImage + BOOT_IMAGE="`identify_boot_image $QEMU`" + KERNEL=$base_resdir/${BOOT_IMAGE##*/} # use the last component of ${BOOT_IMAGE} ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh ln -s $base_resdir/.config $resdir # for kvm-recheck.sh elif kvm-build.sh $config_template $builddir $T @@ -110,7 +111,7 @@ then if test -n "$BOOT_IMAGE" then cp $builddir/$BOOT_IMAGE $resdir - KERNEL=$resdir/bzImage + KERNEL=$resdir/${BOOT_IMAGE##*/} else echo No identifiable boot image, not running KVM, see $resdir. echo Do the torture scripts know about your architecture? -- cgit v0.10.2 From 1b2f48f29e5507a15c1fa8264362d30c02607dcd Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 19 May 2016 11:42:23 +0800 Subject: rcutorture: Make -soundhw a x86 specific option The option "-soundhw pcspk" gives me a error on PPC as follow: qemu-system-ppc64: ISA bus not available for pcspk This means this option doesn't work on ppc by default. So simply make this an x86-specific option via identify_qemu_args(). Signed-off-by: Boqun Feng Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 6161801..77fdb46c 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -171,6 +171,7 @@ identify_qemu_append () { identify_qemu_args () { case "$1" in qemu-system-x86_64|qemu-system-i386) + echo -soundhw pcspk ;; qemu-system-ppc64) echo -enable-kvm -M pseries -cpu POWER7 -nodefaults diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 8dc5e463..ea6e373 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -8,9 +8,9 @@ # # Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args # -# qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with -# arguments specifying the number of CPUs and other -# options generated from the underlying CPU architecture. +# qemu-args defaults to "-enable-kvm -nographic", along with arguments +# specifying the number of CPUs and other options +# generated from the underlying CPU architecture. # boot_args defaults to value returned by the per_version_boot_params # shell function. # @@ -148,7 +148,7 @@ then fi # Generate -smp qemu argument. -qemu_args="-enable-kvm -soundhw pcspk -nographic $qemu_args" +qemu_args="-enable-kvm -nographic $qemu_args" cpu_count=`configNR_CPUS.sh $config_template` cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` vcpus=`identify_qemu_vcpus` -- cgit v0.10.2 From d5e953739ca6874caa6a656b67593cafb223b700 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 19 May 2016 11:42:24 +0800 Subject: rcutorture: Don't specify the cpu type of QEMU on PPC Do not restrict the cpu type to POWER7 for QEMU as we have POWER8 now. Signed-off-by: Boqun Feng Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 77fdb46c..56ac202 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -174,7 +174,7 @@ identify_qemu_args () { echo -soundhw pcspk ;; qemu-system-ppc64) - echo -enable-kvm -M pseries -cpu POWER7 -nodefaults + echo -enable-kvm -M pseries -nodefaults echo -device spapr-vscsi if test -n "$TORTURE_QEMU_INTERACTIVE" -a -n "$TORTURE_QEMU_MAC" then -- cgit v0.10.2 From 5ef20f872d10c695c30ee036f11c687a23944158 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 May 2016 14:18:48 -0700 Subject: rcutorture: Drop "-soundhw pcspkr" from x86 boot arguments Because recent testing shows that "-soundhw pcspkr" is no longer required in the kernel boot arguments, this commit drops this qemu argument. Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 56ac202..1426a9b 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -171,7 +171,6 @@ identify_qemu_append () { identify_qemu_args () { case "$1" in qemu-system-x86_64|qemu-system-i386) - echo -soundhw pcspk ;; qemu-system-ppc64) echo -enable-kvm -M pseries -nodefaults -- cgit v0.10.2 From af06d4f74a7d2132c805339bfd5ab771b5706f42 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 25 May 2016 09:25:33 +0800 Subject: rcuperf: Don't treat gp_exp mis-setting as a WARN 0day found a boot warning triggered in rcu_perf_writer() on !SMP kernel: WARN_ON(rcu_gp_is_normal() && gp_exp); , the root cause of which is trying to measure expedited grace periods(by setting gp_exp to true by default) when all the grace periods are normal(TINY RCU only has normal grace periods). However, such a mis-setting would only result in failing to measure the performance for a specific kind of grace periods, therefore using a WARN_ON to check this is a little overkilling. We could handle this inside rcuperf module via some error messages to tell users about the mis-settings. Therefore this patch removes the WARN_ON in rcu_perf_writer() and handles those checkings in rcu_perf_init() with plain if() code. Moreover, this patch changes the default value of gp_exp to 1) align with rcutorture tests and 2) make the default setting work for all RCU implementations by default. Suggested-by: Paul E. McKenney Signed-off-by: Boqun Feng Fixes: http://lkml.kernel.org/r/57411b10.mFvG0+AgcrMXGtcj%fengguang.wu@intel.com Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index afd174e..7b2dbdf 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney "); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) -torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); @@ -358,8 +358,6 @@ rcu_perf_writer(void *arg) u64 *wdpp = writer_durations[me]; VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); - WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; @@ -626,6 +624,16 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + if (rcu_gp_is_normal() && gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), -- cgit v0.10.2 From 65cbea5bbd32280f4620c20a6f651c37a6058752 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jun 2016 13:09:19 -0700 Subject: torture: Inflict default jitter This commit enables jitter by default. It may be manually disabled by passing "--jitter 0" to kvm.sh. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 0d59814..0aed965 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -48,7 +48,7 @@ resdir="" configs="" cpus=0 ds=`date +%Y.%m.%d-%H:%M:%S` -jitter=0 +jitter="-1" . functions.sh -- cgit v0.10.2 From 05dbbfe753792dcebb6b85d84fa5926f09723cfe Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 13 Jun 2016 15:20:39 +0000 Subject: rcutorture: Fix error return code in rcu_perf_init() Fix to return a negative error code -ENOMEM from kcalloc() error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7b2dbdf..d38ab08 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -638,8 +638,10 @@ rcu_perf_init(void) writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), GFP_KERNEL); - if (!writer_durations[i]) + if (!writer_durations[i]) { + firsterr = -ENOMEM; goto unwind; + } firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, writer_tasks[i]); if (firsterr) -- cgit v0.10.2 From 3a37f7275cda5ad25c1fe9be8f20c76c60d175fa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 1 May 2016 18:46:54 -0700 Subject: rcu: No ordering for rcu_assign_pointer() of NULL This commit does a compile-time check for rcu_assign_pointer() of NULL, and uses WRITE_ONCE() rather than smp_store_release() in that case. Reported-by: Christoph Hellwig Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c61b6b9..a8af797 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -650,7 +650,16 @@ static inline void rcu_preempt_sleep_check(void) * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ -#define rcu_assign_pointer(p, v) smp_store_release(&p, RCU_INITIALIZER(v)) +#define rcu_assign_pointer(p, v) \ +({ \ + uintptr_t _r_a_p__v = (uintptr_t)(v); \ + \ + if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ + WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ + else \ + smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ + _r_a_p__v; \ +}) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing -- cgit v0.10.2 From 570dd3c7424179b831decb655ea9dd1ecea38adc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 Jun 2016 08:56:53 -0700 Subject: rcu: Disable TASKS_RCU for usermode Linux Usermode Linux currently does not implement arch_irqs_disabled_flags(), which results in a build failure in TASKS_RCU. Therefore, this commit disables the TASKS_RCU Kconfig option in usermode Linux builds. The usermode Linux maintainers expect to merge arch_irqs_disabled_flags() into 4.8, at which point this commit may be reverted. Signed-off-by: Paul E. McKenney Cc: Jeff Dike Acked-by: Richard Weinberger diff --git a/init/Kconfig b/init/Kconfig index f755a60..a068265 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -517,6 +517,7 @@ config SRCU config TASKS_RCU bool default n + depends on !UML select SRCU help This option enables a task-based RCU implementation that uses -- cgit v0.10.2 From 4929c913bda505dbe44bb42c00da06011fee6c9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2016 11:58:56 -0700 Subject: rcu: Make call_rcu_tasks() tolerate first call with irqs disabled Currently, if the very first call to call_rcu_tasks() has irqs disabled, it will create the rcu_tasks_kthread with irqs disabled, which will result in a splat in the memory allocator, which kthread_run() invokes with the expectation that irqs are enabled. This commit fixes this problem by deferring kthread creation if called with irqs disabled. The first call to call_rcu_tasks() that has irqs enabled will create the kthread. This bug was detected by rcutorture changes that were motivated by Iftekhar Ahmed's mutation-testing efforts. Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a8af797..3bc5de0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -45,6 +45,7 @@ #include #include #include +#include #include diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd..f0d8322 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); +static struct task_struct *rcu_tasks_kthread_ptr; /* * Post an RCU-tasks callback. First call must be from process context @@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - if (needwake) { + /* We can't create the thread unless interrupts are enabled. */ + if ((needwake && havetask) || + (!havetask && !irqs_disabled_flags(flags))) { rcu_spawn_tasks_kthread(); wake_up(&rcu_tasks_cbs_wq); } @@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void rcu_spawn_tasks_kthread(void) { static DEFINE_MUTEX(rcu_tasks_kthread_mutex); - static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; if (READ_ONCE(rcu_tasks_kthread_ptr)) { -- cgit v0.10.2 From aab057382cb9b16249552684c1ebd270f070ec02 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2016 12:20:51 -0700 Subject: rcu: Fix a typo in a comment In the area in hot pursuit of a bug, so might as well clean it up. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a..60bd533 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1476,7 +1476,7 @@ static int rcu_torture_barrier_cbs(void *arg) break; /* * The above smp_load_acquire() ensures barrier_phase load - * is ordered before the folloiwng ->call(). + * is ordered before the following ->call(). */ local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); -- cgit v0.10.2 From 088e9d253d3a4ab7e058dd84bb532c32dadf1882 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 2 Jun 2016 13:51:41 -0300 Subject: rcu: sysctl: Panic on RCU Stall It is not always easy to determine the cause of an RCU stall just by analysing the RCU stall messages, mainly when the problem is caused by the indirect starvation of rcu threads. For example, when preempt_rcu is not awakened due to the starvation of a timer softirq. We have been hard coding panic() in the RCU stall functions for some time while testing the kernel-rt. But this is not possible in some scenarios, like when supporting customers. This patch implements the sysctl kernel.panic_on_rcu_stall. If set to 1, the system will panic() when an RCU stall takes place, enabling the capture of a vmcore. The vmcore provides a way to analyze all kernel/tasks states, helping out to point to the culprit and the solution for the stall. The kernel.panic_on_rcu_stall sysctl is disabled by default. Changes from v1: - Fixed a typo in the git log - The if(sysctl_panic_on_rcu_stall) panic() is in a static function - Fixed the CONFIG_TINY_RCU compilation issue - The var sysctl_panic_on_rcu_stall is now __read_mostly Cc: Jonathan Corbet Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Acked-by: Christian Borntraeger Reviewed-by: Josh Triplett Reviewed-by: Arnaldo Carvalho de Melo Tested-by: "Luis Claudio R. Goncalves" Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Paul E. McKenney diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a3683ce..3320460 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,7 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- panic_on_rcu_stall - perf_cpu_time_max_percent - perf_event_paranoid - perf_event_max_stack @@ -618,6 +619,17 @@ a kernel rebuild when attempting to kdump at the location of a WARN(). ============================================================== +panic_on_rcu_stall: + +When set to 1, calls panic() after RCU stall detection messages. This +is useful to define the root cause of RCU stalls using a vmcore. + +0: do not panic() when RCU stall takes place, default behavior. + +1: panic() after printing RCU stall messages. + +============================================================== + perf_cpu_time_max_percent: Hints to the kernel how much CPU time it should be allowed to diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10f..c420821 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -451,6 +451,7 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c844b61..e5ca15a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -125,6 +125,8 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable transitions from zero to one just @@ -1312,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } +static inline void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1391,6 +1399,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); + panic_on_rcu_stall(); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1431,6 +1441,8 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + panic_on_rcu_stall(); + /* * Attempt to revive the RCU machinery by forcing a context switch. * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc3..35f0dcb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; -- cgit v0.10.2 From bc75e99983df1efd977a5cd468893d55d52b8d70 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 3 Jun 2016 15:20:04 +0100 Subject: rcu: Correctly handle sparse possible cpus In many cases in the RCU tree code, we iterate over the set of cpus for a leaf node described by rcu_node::grplo and rcu_node::grphi, checking per-cpu data for each cpu in this range. However, if the set of possible cpus is sparse, some cpus described in this range are not possible, and thus no per-cpu region will have been allocated (or initialised) for them by the generic percpu code. Erroneous accesses to a per-cpu area for these !possible cpus may fault or may hit other data depending on the addressed generated when the erroneous per cpu offset is applied. In practice, both cases have been observed on arm64 hardware (the former being silent, but detectable with additional patches). To avoid issues resulting from this, we must iterate over the set of *possible* cpus for a given leaf node. This patch add a new helper, for_each_leaf_node_possible_cpu, to enable this. As iteration is often intertwined with rcu_node local bitmask manipulation, a new leaf_node_cpu_bit helper is added to make this simpler and more consistent. The RCU tree code is made to use both of these where appropriate. Without this patch, running reboot at a shell can result in an oops like: [ 3369.075979] Unable to handle kernel paging request at virtual address ffffff8008b21b4c [ 3369.083881] pgd = ffffffc3ecdda000 [ 3369.087270] [ffffff8008b21b4c] *pgd=00000083eca48003, *pud=00000083eca48003, *pmd=0000000000000000 [ 3369.096222] Internal error: Oops: 96000007 [#1] PREEMPT SMP [ 3369.101781] Modules linked in: [ 3369.104825] CPU: 2 PID: 1817 Comm: NetworkManager Tainted: G W 4.6.0+ #3 [ 3369.121239] task: ffffffc0fa13e000 ti: ffffffc3eb940000 task.ti: ffffffc3eb940000 [ 3369.128708] PC is at sync_rcu_exp_select_cpus+0x188/0x510 [ 3369.134094] LR is at sync_rcu_exp_select_cpus+0x104/0x510 [ 3369.139479] pc : [] lr : [] pstate: 200001c5 [ 3369.146860] sp : ffffffc3eb9435a0 [ 3369.150162] x29: ffffffc3eb9435a0 x28: ffffff8008be4f88 [ 3369.155465] x27: ffffff8008b66c80 x26: ffffffc3eceb2600 [ 3369.160767] x25: 0000000000000001 x24: ffffff8008be4f88 [ 3369.166070] x23: ffffff8008b51c3c x22: ffffff8008b66c80 [ 3369.171371] x21: 0000000000000001 x20: ffffff8008b21b40 [ 3369.176673] x19: ffffff8008b66c80 x18: 0000000000000000 [ 3369.181975] x17: 0000007fa951a010 x16: ffffff80086a30f0 [ 3369.187278] x15: 0000007fa9505590 x14: 0000000000000000 [ 3369.192580] x13: ffffff8008b51000 x12: ffffffc3eb940000 [ 3369.197882] x11: 0000000000000006 x10: ffffff8008b51b78 [ 3369.203184] x9 : 0000000000000001 x8 : ffffff8008be4000 [ 3369.208486] x7 : ffffff8008b21b40 x6 : 0000000000001003 [ 3369.213788] x5 : 0000000000000000 x4 : ffffff8008b27280 [ 3369.219090] x3 : ffffff8008b21b4c x2 : 0000000000000001 [ 3369.224406] x1 : 0000000000000001 x0 : 0000000000000140 ... [ 3369.972257] [] sync_rcu_exp_select_cpus+0x188/0x510 [ 3369.978685] [] synchronize_rcu_expedited+0x64/0xa8 [ 3369.985026] [] synchronize_net+0x24/0x30 [ 3369.990499] [] dev_deactivate_many+0x28c/0x298 [ 3369.996493] [] __dev_close_many+0x60/0xd0 [ 3370.002052] [] __dev_close+0x28/0x40 [ 3370.007178] [] __dev_change_flags+0x8c/0x158 [ 3370.012999] [] dev_change_flags+0x20/0x60 [ 3370.018558] [] do_setlink+0x288/0x918 [ 3370.023771] [] rtnl_newlink+0x398/0x6a8 [ 3370.029158] [] rtnetlink_rcv_msg+0xe4/0x220 [ 3370.034891] [] netlink_rcv_skb+0xc4/0xf8 [ 3370.040364] [] rtnetlink_rcv+0x2c/0x40 [ 3370.045663] [] netlink_unicast+0x160/0x238 [ 3370.051309] [] netlink_sendmsg+0x2f0/0x358 [ 3370.056956] [] sock_sendmsg+0x18/0x30 [ 3370.062168] [] ___sys_sendmsg+0x26c/0x280 [ 3370.067728] [] __sys_sendmsg+0x44/0x88 [ 3370.073027] [] SyS_sendmsg+0x10/0x20 [ 3370.078153] [] el0_svc_naked+0x24/0x28 Signed-off-by: Mark Rutland Reported-by: Dennis Chen Cc: Catalin Marinas Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: Steve Capper Cc: Steven Rostedt Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e5ca15a..f433959 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1287,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1360,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(rsp, cpu); ndetected++; } } @@ -2884,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj) { - unsigned long bit; int cpu; unsigned long flags; unsigned long mask; @@ -2919,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, continue; } } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; @@ -3750,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5..f714f87 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -254,6 +254,13 @@ struct rcu_node { } ____cacheline_internodealigned_in_smp; /* + * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and + * are indexed relative to this interval rather than the global CPU ID space. + * This generates the bit for a CPU in node-local masks. + */ +#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) + +/* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. */ @@ -281,6 +288,14 @@ struct rcu_node { (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +/* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 00a02a2..d400434 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -344,7 +344,6 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, { int cpu; unsigned long flags; - unsigned long mask; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; @@ -356,7 +355,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -376,8 +375,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: @@ -440,10 +439,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp; + mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) continue; ndetected++; @@ -453,7 +452,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } - mask <<= 1; } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, @@ -473,8 +471,8 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont("\n"); } rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) continue; dump_cpu_task(cpu); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 695071d..534c590 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1166,8 +1166,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); if (cpumask_weight(cm) == 0) cpumask_setall(cm); -- cgit v0.10.2