path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c | 8
-rw-r--r--  kernel/context_tracking.c | 59
-rw-r--r--  kernel/cpu.c | 43
-rw-r--r--  kernel/cpuset.c | 13
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/irq/chip.c | 16
-rw-r--r--  kernel/irq/manage.c | 127
-rw-r--r--  kernel/irq/msi.c | 11
-rw-r--r--  kernel/livepatch/core.c | 69
-rw-r--r--  kernel/locking/mcs_spinlock.h | 6
-rw-r--r--  kernel/locking/mutex.c | 51
-rw-r--r--  kernel/locking/osq_lock.c | 14
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 7
-rw-r--r--  kernel/locking/rwsem-xadd.c | 98
-rw-r--r--  kernel/locking/rwsem.c | 22
-rw-r--r--  kernel/locking/rwsem.h | 20
-rw-r--r--  kernel/module.c | 22
-rw-r--r--  kernel/power/snapshot.c | 21
-rw-r--r--  kernel/rcu/rcutorture.c | 27
-rw-r--r--  kernel/rcu/srcu.c | 19
-rw-r--r--  kernel/rcu/tiny.c | 14
-rw-r--r--  kernel/rcu/tree.c | 437
-rw-r--r--  kernel/rcu/tree.h | 11
-rw-r--r--  kernel/rcu/tree_plugin.h | 267
-rw-r--r--  kernel/rcu/tree_trace.c | 4
-rw-r--r--  kernel/rcu/update.c | 72
-rw-r--r--  kernel/sched/core.c | 119
-rw-r--r--  kernel/sched/deadline.c | 77
-rw-r--r--  kernel/sched/debug.c | 12
-rw-r--r--  kernel/sched/fair.c | 429
-rw-r--r--  kernel/sched/features.h | 13
-rw-r--r--  kernel/sched/idle.c | 14
-rw-r--r--  kernel/sched/rt.c | 181
-rw-r--r--  kernel/sched/sched.h | 38
-rw-r--r--  kernel/smpboot.c | 156
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/time/Kconfig | 6
-rw-r--r--  kernel/time/Makefile | 6
-rw-r--r--  kernel/time/clockevents.c | 201
-rw-r--r--  kernel/time/clocksource.c | 7
-rw-r--r--  kernel/time/hrtimer.c | 9
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/ntp.c | 14
-rw-r--r--  kernel/time/tick-broadcast.c | 179
-rw-r--r--  kernel/time/tick-common.c | 82
-rw-r--r--  kernel/time/tick-internal.h | 211
-rw-r--r--  kernel/time/tick-oneshot.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 7
-rw-r--r--  kernel/time/tick-sched.h | 74
-rw-r--r--  kernel/time/timekeeping.c | 145
-rw-r--r--  kernel/time/timekeeping.h | 7
-rw-r--r--  kernel/time/timer.c | 149
-rw-r--r--  kernel/time/timer_list.c | 18
-rw-r--r--  kernel/trace/Kconfig | 28
-rw-r--r--  kernel/trace/ftrace.c | 44
-rw-r--r--  kernel/trace/ring_buffer.c | 10
-rw-r--r--  kernel/trace/trace.c | 491
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_entries.h | 6
-rw-r--r--  kernel/trace/trace_events.c | 153
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 7
-rw-r--r--  kernel/trace/trace_kprobe.c | 15
-rw-r--r--  kernel/trace/trace_probe.c | 19
-rw-r--r--  kernel/trace/trace_probe.h | 12
-rw-r--r--  kernel/trace/trace_stat.c | 10
-rw-r--r--  kernel/trace/trace_uprobe.c | 5
-rw-r--r--  kernel/workqueue.c | 847
69 files changed, 3505 insertions(+), 1780 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2c..a220fdb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count)
static void pidlist_free(void *p)
{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- kfree(p);
+ kvfree(p);
}
/*
@@ -5040,6 +5037,9 @@ int __init cgroup_init(void)
WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
+
+ if (ss->bind)
+ ss->bind(init_css_set.subsys[ssid]);
}
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 937ecdf..72d59a1 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu)
}
/**
- * context_tracking_user_enter - Inform the context tracking that the CPU is going to
- * enter userspace mode.
+ * context_tracking_enter - Inform the context tracking that the CPU is going to
+ * enter user or guest space mode.
*
* This function must be called right before we switch from the kernel
- * to userspace, when it's guaranteed the remaining kernel instructions
- * to execute won't use any RCU read side critical section because this
- * function sets RCU in extended quiescent state.
+ * to user or guest space, when it's guaranteed the remaining kernel
+ * instructions to execute won't use any RCU read side critical section
+ * because this function sets RCU in extended quiescent state.
*/
-void context_tracking_user_enter(void)
+void context_tracking_enter(enum ctx_state state)
{
unsigned long flags;
@@ -75,9 +75,8 @@ void context_tracking_user_enter(void)
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
- if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if ( __this_cpu_read(context_tracking.state) != state) {
if (__this_cpu_read(context_tracking.active)) {
- trace_user_enter(0);
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
@@ -85,7 +84,10 @@ void context_tracking_user_enter(void)
* user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
- vtime_user_enter(current);
+ if (state == CONTEXT_USER) {
+ trace_user_enter(0);
+ vtime_user_enter(current);
+ }
rcu_user_enter();
}
/*
@@ -101,24 +103,32 @@ void context_tracking_user_enter(void)
* OTOH we can spare the calls to vtime and RCU when context_tracking.active
* is false because we know that CPU is not tickless.
*/
- __this_cpu_write(context_tracking.state, IN_USER);
+ __this_cpu_write(context_tracking.state, state);
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_enter);
+EXPORT_SYMBOL_GPL(context_tracking_enter);
+
+void context_tracking_user_enter(void)
+{
+ context_tracking_enter(CONTEXT_USER);
+}
NOKPROBE_SYMBOL(context_tracking_user_enter);
/**
- * context_tracking_user_exit - Inform the context tracking that the CPU is
- * exiting userspace mode and entering the kernel.
+ * context_tracking_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
*
- * This function must be called after we entered the kernel from userspace
- * before any use of RCU read side critical section. This potentially include
- * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ * This function must be called after we entered the kernel from user or
+ * guest space before any use of RCU read side critical section. This
+ * potentially includes any high level kernel code like syscalls, exceptions,
+ * signal handling, etc...
*
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void context_tracking_user_exit(void)
+void context_tracking_exit(enum ctx_state state)
{
unsigned long flags;
@@ -129,20 +139,29 @@ void context_tracking_user_exit(void)
return;
local_irq_save(flags);
- if (__this_cpu_read(context_tracking.state) == IN_USER) {
+ if (__this_cpu_read(context_tracking.state) == state) {
if (__this_cpu_read(context_tracking.active)) {
/*
* We are going to run code that may use RCU. Inform
* RCU core about that (ie: we may need the tick again).
*/
rcu_user_exit();
- vtime_user_exit(current);
- trace_user_exit(0);
+ if (state == CONTEXT_USER) {
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
}
- __this_cpu_write(context_tracking.state, IN_KERNEL);
+ __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_exit);
+EXPORT_SYMBOL_GPL(context_tracking_exit);
+
+void context_tracking_user_exit(void)
+{
+ context_tracking_exit(CONTEXT_USER);
+}
NOKPROBE_SYMBOL(context_tracking_user_exit);
/**
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b16..94bbe46 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
+#include <linux/tick.h>
#include <trace/events/power.h>
#include "smpboot.h"
@@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Give up timekeeping duties */
+ tick_handover_do_timer();
/* Park the stopper thread */
kthread_park(current);
return 0;
@@ -408,13 +411,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
*
* Wait for the stop thread to go away.
*/
- while (!idle_cpu(cpu))
+ while (!per_cpu(cpu_dead_idle, cpu))
cpu_relax();
+ smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
+ per_cpu(cpu_dead_idle, cpu) = false;
+ hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
+ tick_cleanup_dead_cpu(cpu);
cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
@@ -446,6 +453,37 @@ out:
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
+/*
+ * Unpark per-CPU smpboot kthreads at CPU-online time.
+ */
+static int smpboot_thread_call(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_ONLINE:
+ smpboot_unpark_threads(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block smpboot_thread_notifier = {
+ .notifier_call = smpboot_thread_call,
+ .priority = CPU_PRI_SMPBOOT,
+};
+
+void __cpuinit smpboot_thread_init(void)
+{
+ register_cpu_notifier(&smpboot_thread_notifier);
+}
+
/* Requires cpu_add_remove_lock to be held */
static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
@@ -485,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- /* Wake the per cpu threads */
- smpboot_unpark_threads(cpu);
-
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f474..c68f072 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
+ if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+ goto done;
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.effective_cpus);
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ non_isolated_cpus);
goto done;
}
@@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
* the corresponding sched domain.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- !is_sched_load_balance(cp))
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
continue;
if (is_sched_load_balance(cp))
@@ -748,6 +755,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
+ cpumask_and(dp, dp, non_isolated_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -760,6 +768,7 @@ restart:
BUG_ON(nslot != ndoms);
done:
+ free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
diff --git a/kernel/futex.c b/kernel/futex.c
index 2a5e383..2579e40 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
if (!p)
return -ESRCH;
- if (!p->mm) {
+ if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
return -EPERM;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a5..eb9a4ea 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
return -ENOSYS;
}
+
+/**
+ * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @on: Whether to set or reset the wake-up capability of this irq
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_wake)
+ return data->chip->irq_set_wake(data, on);
+
+ return -ENOSYS;
+}
#endif
/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 886d09e..e68932b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
* Do not use this for shutdown scenarios where you must be sure
* that all parts (hardirq and threaded handler) have completed.
*
+ * Returns: false if a threaded handler is active.
+ *
* This function may be called - with care - from IRQ context.
*/
-void synchronize_hardirq(unsigned int irq)
+bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc)
+ if (desc) {
__synchronize_hardirq(desc);
+ return !atomic_read(&desc->threads_active);
+ }
+
+ return true;
}
EXPORT_SYMBOL(synchronize_hardirq);
@@ -440,6 +446,32 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
+/**
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this function while
+ * holding a resource the hard IRQ handler may need, you will deadlock.
+ *
+ * When used to optimistically disable an interrupt from atomic context
+ * the return value must be checked.
+ *
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+bool disable_hardirq(unsigned int irq)
+{
+ if (!__disable_irq_nosync(irq))
+ return synchronize_hardirq(irq);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(disable_hardirq);
+
void __enable_irq(struct irq_desc *desc, unsigned int irq)
{
switch (desc->depth) {
@@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
+
+/**
+ * irq_get_irqchip_state - returns the irqchip state of an interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be stored
+ *
+ * This call snapshots the internal irqchip state of an
+ * interrupt, returning into @state the bit corresponding to
+ * state @which
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
+
+/**
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * This call sets the internal irqchip state of an interrupt,
+ * depending on the value of @which.
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool val)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_set_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_set_irqchip_state(data, which, val);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
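The irq_get_irqchip_state()/irq_set_irqchip_state() pair added above is intended for code (typically hypervisor/VFIO-style drivers) that must save and restore the state of an interrupt forwarded to a VM. A minimal, hypothetical caller, assuming a valid Linux IRQ number, might look like this; IRQCHIP_STATE_PENDING is one of the IRQCHIP_STATE_* values the kernel-doc refers to:

#include <linux/interrupt.h>
#include <linux/irq.h>

/* Hedged sketch, not part of this patch: snapshot the pending state of a
 * forwarded interrupt and restore it later. */
static int snapshot_and_restore_pending(unsigned int irq)
{
	bool pending;
	int err;

	err = irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending);
	if (err)
		return err;

	/* ... hand the interrupt to the guest, migrate it, etc. ... */

	return irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, pending);
}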
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3e18163..474de5c 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
struct msi_desc *desc;
for_each_msi_entry(desc, dev) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ /*
+ * We might have failed to allocate an MSI early
+ * enough that there is no IRQ associated to this
+ * entry. If that's the case, don't do anything.
+ */
+ if (desc->irq) {
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ desc->irq = 0;
+ }
}
}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3f9f1d6..284e269 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -335,32 +335,20 @@ unlock:
rcu_read_unlock();
}
-static int klp_disable_func(struct klp_func *func)
+static void klp_disable_func(struct klp_func *func)
{
struct klp_ops *ops;
- int ret;
-
- if (WARN_ON(func->state != KLP_ENABLED))
- return -EINVAL;
- if (WARN_ON(!func->old_addr))
- return -EINVAL;
+ WARN_ON(func->state != KLP_ENABLED);
+ WARN_ON(!func->old_addr);
ops = klp_find_ops(func->old_addr);
if (WARN_ON(!ops))
- return -EINVAL;
+ return;
if (list_is_singular(&ops->func_stack)) {
- ret = unregister_ftrace_function(&ops->fops);
- if (ret) {
- pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
- func->old_name, ret);
- return ret;
- }
-
- ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
- if (ret)
- pr_warn("function unregister succeeded but failed to clear the filter\n");
+ WARN_ON(unregister_ftrace_function(&ops->fops));
+ WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
list_del_rcu(&func->stack_node);
list_del(&ops->node);
@@ -370,8 +358,6 @@ static int klp_disable_func(struct klp_func *func)
}
func->state = KLP_DISABLED;
-
- return 0;
}
static int klp_enable_func(struct klp_func *func)
@@ -432,23 +418,15 @@ err:
return ret;
}
-static int klp_disable_object(struct klp_object *obj)
+static void klp_disable_object(struct klp_object *obj)
{
struct klp_func *func;
- int ret;
- for (func = obj->funcs; func->old_name; func++) {
- if (func->state != KLP_ENABLED)
- continue;
-
- ret = klp_disable_func(func);
- if (ret)
- return ret;
- }
+ for (func = obj->funcs; func->old_name; func++)
+ if (func->state == KLP_ENABLED)
+ klp_disable_func(func);
obj->state = KLP_DISABLED;
-
- return 0;
}
static int klp_enable_object(struct klp_object *obj)
@@ -464,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj)
for (func = obj->funcs; func->old_name; func++) {
ret = klp_enable_func(func);
- if (ret)
- goto unregister;
+ if (ret) {
+ klp_disable_object(obj);
+ return ret;
+ }
}
obj->state = KLP_ENABLED;
return 0;
-
-unregister:
- WARN_ON(klp_disable_object(obj));
- return ret;
}
static int __klp_disable_patch(struct klp_patch *patch)
{
struct klp_object *obj;
- int ret;
/* enforce stacking: only the last enabled patch can be disabled */
if (!list_is_last(&patch->list, &klp_patches) &&
@@ -489,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch)
pr_notice("disabling patch '%s'\n", patch->mod->name);
for (obj = patch->objs; obj->funcs; obj++) {
- if (obj->state != KLP_ENABLED)
- continue;
-
- ret = klp_disable_object(obj);
- if (ret)
- return ret;
+ if (obj->state == KLP_ENABLED)
+ klp_disable_object(obj);
}
patch->state = KLP_DISABLED;
@@ -553,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
pr_notice("enabling patch '%s'\n", patch->mod->name);
for (obj = patch->objs; obj->funcs; obj++) {
- klp_find_object_module(obj);
-
if (!klp_is_object_loaded(obj))
continue;
@@ -945,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch,
{
struct module *pmod = patch->mod;
struct module *mod = obj->mod;
- int ret;
if (patch->state == KLP_DISABLED)
goto disabled;
@@ -953,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch,
pr_notice("reverting patch '%s' on unloading module '%s'\n",
pmod->name, mod->name);
- ret = klp_disable_object(obj);
- if (ret)
- pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
- pmod->name, mod->name, ret);
+ klp_disable_object(obj);
disabled:
klp_free_object_loaded(obj);
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index d1fe2ba..75e114b 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
*/
return;
}
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/* Wait until the lock holder passes the lock down. */
arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
static inline
void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
- struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+ struct mcs_spinlock *next = READ_ONCE(node->next);
if (likely(!next)) {
/*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
+ while (!(next = READ_ONCE(node->next)))
cpu_relax_lowlatency();
}
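The ACCESS_ONCE() conversions here and in the following locking files switch to the READ_ONCE()/WRITE_ONCE() accessors, which tell the compiler to perform exactly one access and not to tear or refetch it for machine-word-sized objects. As a rough, non-kernel illustration of the idea (the real macros in <linux/compiler.h> are more elaborate and also handle aggregate types), the core is a cast through a volatile pointer:

/* Illustrative sketch only -- not the kernel's actual implementation. */
#define read_once_sketch(x)       (*(const volatile typeof(x) *)&(x))
#define write_once_sketch(x, val) (*(volatile typeof(x) *)&(x) = (val))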
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 94674e5..4cccea6 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
}
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
- if (lock->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
-
- return owner->on_cpu;
-}
-
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/
static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
+ bool ret = true;
+
rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
break;
+ }
cpu_relax_lowlatency();
}
rcu_read_unlock();
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
+ return ret;
}
/*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
return 0;
rcu_read_lock();
- owner = ACCESS_ONCE(lock->owner);
+ owner = READ_ONCE(lock->owner);
if (owner)
retval = owner->on_cpu;
rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* As such, when deadlock detection needs to be
* performed the optimistic spinning cannot be done.
*/
- if (ACCESS_ONCE(ww->ctx))
+ if (READ_ONCE(ww->ctx))
break;
}
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
- owner = ACCESS_ONCE(lock->owner);
+ owner = READ_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner))
break;
@@ -490,7 +481,7 @@ static inline int __sched
__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
if (!hold_ctx)
return 0;
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index c112d00..dc85ee2 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/*
* Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!ACCESS_ONCE(node->locked)) {
+ while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
*/
@@ -148,7 +148,7 @@ unqueue:
* Or we race against a concurrent unqueue()'s step-B, in which
* case its step-C will write us a new @node->prev pointer.
*/
- prev = ACCESS_ONCE(node->prev);
+ prev = READ_ONCE(node->prev);
}
/*
@@ -170,8 +170,8 @@ unqueue:
* it will wait in Step-A.
*/
- ACCESS_ONCE(next->prev) = prev;
- ACCESS_ONCE(prev->next) = next;
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
return false;
}
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
return;
}
next = osq_wait_next(lock, node, NULL);
if (next)
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6357265..b732793 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
*
* @task: the task owning the mutex (owner) for which a chain walk is
* probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk: do we have to carry out deadlock detection?
* @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
* things for a task that has just got its priority adjusted, and
* is waiting on a mutex)
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2555ae1..3a50485 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
list_del(&waiter->list);
tsk = waiter->task;
+ /*
+ * Make sure we do not wakeup the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc40..3417d01 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
#include <linux/init.h>
#include <linux/export.h>
#include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
-#include "mcs_spinlock.h"
+#include "rwsem.h"
/*
* Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
tsk = waiter->task;
+ /*
+ * Make sure we do not wakeup the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
if (!list_is_singular(&sem->wait_list))
rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ rwsem_set_owner(sem);
return true;
}
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = ACCESS_ONCE(sem->count);
+ long old, count = READ_ONCE(sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count)
+ if (old == count) {
+ rwsem_set_owner(sem);
return true;
+ }
count = old;
}
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
- bool on_cpu = false;
+ bool ret = true;
if (need_resched())
return false;
rcu_read_lock();
- owner = ACCESS_ONCE(sem->owner);
- if (owner)
- on_cpu = owner->on_cpu;
- rcu_read_unlock();
-
- /*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath, then there is a possibility reader(s) may have the lock.
- * To be safe, avoid spinning in these situations.
- */
- return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
- if (sem->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * sem->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
+ owner = READ_ONCE(sem->owner);
+ if (!owner) {
+ long count = READ_ONCE(sem->count);
+ /*
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath with the lock being active, then there is a possibility
+ * reader(s) may have the lock. To be safe, bail spinning in these
+ * situations.
+ */
+ if (count & RWSEM_ACTIVE_MASK)
+ ret = false;
+ goto done;
+ }
- return owner->on_cpu;
+ ret = owner->on_cpu;
+done:
+ rcu_read_unlock();
+ return ret;
}
static noinline
bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
{
+ long count;
+
rcu_read_lock();
- while (owner_running(sem, owner)) {
- if (need_resched())
- break;
+ while (sem->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ /* abort spinning when need_resched or owner is not running */
+ if (!owner->on_cpu || need_resched()) {
+ rcu_read_unlock();
+ return false;
+ }
cpu_relax_lowlatency();
}
rcu_read_unlock();
+ if (READ_ONCE(sem->owner))
+ return true; /* new owner, continue spinning */
+
/*
- * We break out the loop above on need_resched() or when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when sem->owner is NULL.
+ * When the owner is not set, the lock could be free or
+ * held by readers. Check the counter to verify the
+ * state.
*/
- return sem->owner == NULL;
+ count = READ_ONCE(sem->count);
+ return (count == 0 || count == RWSEM_WAITING_BIAS);
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
goto done;
while (true) {
- owner = ACCESS_ONCE(sem->owner);
+ owner = READ_ONCE(sem->owner);
if (owner && !rwsem_spin_on_owner(sem, owner))
break;
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = ACCESS_ONCE(sem->count);
+ count = READ_ONCE(sem->count);
/*
* If there were already threads queued before us and there are
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7..205be0c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rwsem.h>
-
#include <linux/atomic.h>
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
/*
* lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 0000000..870ed9a
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
diff --git a/kernel/module.c b/kernel/module.c
index 99fdf94..650b038 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info)
return 0;
}
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
if (!info->hdr)
return -ENOMEM;
- if (copy_from_user(info->hdr, umod, info->len) != 0) {
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
vfree(info->hdr);
return -EFAULT;
}
@@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
+ mod->trace_enums = section_objs(info, "_ftrace_enum_map",
+ sizeof(*mod->trace_enums),
+ &mod->num_trace_enums);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c24d5a2..5235dd4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
}
}
-static bool is_nosave_page(unsigned long pfn)
-{
- struct nosave_region *region;
-
- list_for_each_entry(region, &nosave_regions, list) {
- if (pfn >= region->start_pfn && pfn < region->end_pfn) {
- pr_err("PM: %#010llx in e820 nosave region: "
- "[mem %#010llx-%#010llx]\n",
- (unsigned long long) pfn << PAGE_SHIFT,
- (unsigned long long) region->start_pfn << PAGE_SHIFT,
- ((unsigned long long) region->end_pfn << PAGE_SHIFT)
- - 1);
- return true;
- }
- }
-
- return false;
-}
-
/**
* create_basic_memory_bitmaps - create bitmaps needed for marking page
* frames that should not be saved and free page frames. The pointers
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
do {
pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
+ if (likely(pfn_valid(pfn)))
swsusp_set_page_free(pfn_to_page(pfn));
else
return -EFAULT;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30d42aa..8dbe276 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
+ bool can_expedite = !rcu_gp_is_expedited();
+ int expediting = 0;
unsigned long gp_snap;
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
bool gp_sync1 = gp_sync;
@@ -865,9 +867,15 @@ rcu_torture_writer(void *arg)
int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ pr_alert("%s" TORTURE_FLAG
+ " Grace periods expedited from boot/sysfs for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " Testing of dynamic grace-period expediting diabled.\n",
+ torture_type);
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
synctype[nsynctypes++] = RTWS_COND_GET;
@@ -949,9 +957,26 @@ rcu_torture_writer(void *arg)
}
}
rcutorture_record_progress(++rcu_torture_current_version);
+ /* Cycle through nesting levels of rcu_expedite_gp() calls. */
+ if (can_expedite &&
+ !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
+ WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
+ if (expediting >= 0)
+ rcu_expedite_gp();
+ else
+ rcu_unexpedite_gp();
+ if (++expediting > 3)
+ expediting = -expediting;
+ }
rcu_torture_writer_state = RTWS_STUTTER;
stutter_wait("rcu_torture_writer");
} while (!torture_must_stop());
+ /* Reset expediting back to unexpedited. */
+ if (expediting > 0)
+ expediting = -expediting;
+ while (can_expedite && expediting++ < 0)
+ rcu_unexpedite_gp();
+ WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 445bf8f..cad76e7 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
}
EXPORT_SYMBOL_GPL(call_srcu);
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_srcu() instance now that a
- * grace period has elapsed.
- */
-static void wakeme_after_rcu(struct rcu_head *head)
-{
- struct rcu_synchronize *rcu;
-
- rcu = container_of(head, struct rcu_synchronize, head);
- complete(&rcu->completion);
-}
-
static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
static void srcu_reschedule(struct srcu_struct *sp);
@@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, rcu_expedited
+ __synchronize_srcu(sp, rcu_gp_is_expedited()
? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
: SYNCHRONIZE_SRCU_TRYCOUNT);
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index cc9ceca..069742d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
RCU_TRACE(reset_cpu_stall_ticks(rcp));
- if (rcp->rcucblist != NULL &&
- rcp->donetail != rcp->curtail) {
+ if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
}
@@ -169,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
unsigned long flags;
RCU_TRACE(int cb_count = 0);
- /* If no RCU callbacks ready to invoke, just return. */
- if (&rcp->rcucblist == rcp->donetail) {
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
- RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
- !!ACCESS_ONCE(rcp->rcucblist),
- need_resched(),
- is_idle_task(current),
- false));
- return;
- }
-
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640c..233165d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
DEFINE_RCU_TPS(sname) \
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
+ .rda = &sname##_data, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
@@ -101,11 +103,9 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
- .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
-}; \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
+}
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -160,6 +162,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
module_param(kthread_prio, int, 0644);
+/* Delay in jiffies for grace-period initialization delays. */
+static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)
+ ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
+ : 0;
+module_param(gp_init_delay, int, 0644);
+
/*
* Track the rcutorture test sequence number and the update version
* number within a given test. The rcutorture_testseq is incremented
@@ -173,6 +181,17 @@ unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return ACCESS_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -292,10 +311,10 @@ void rcu_note_context_switch(void)
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
/*
- * Register a quiesecent state for all RCU flavors. If there is an
+ * Register a quiescent state for all RCU flavors. If there is an
* emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
* dyntick-idle quiescent state visible to other CPUs (but only for those
- * RCU flavors in desparate need of a quiescent state, which will normally
+ * RCU flavors in desperate need of a quiescent state, which will normally
* be none of them). Either way, do a lightweight quiescent state for
* all RCU flavors.
*/
@@ -410,6 +429,15 @@ void rcu_bh_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
/*
+ * Force a quiescent state for RCU-sched.
+ */
+void rcu_sched_force_quiescent_state(void)
+{
+ force_quiescent_state(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+
+/*
* Show the state of the grace-period kthreads.
*/
void show_rcu_gp_kthreads(void)
@@ -483,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum)
EXPORT_SYMBOL_GPL(rcutorture_record_progress);
/*
- * Force a quiescent state for RCU-sched.
- */
-void rcu_sched_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_sched_state);
-}
-EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
-
-/*
* Does the CPU have callbacks ready to be invoked?
*/
static int
@@ -954,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void)
preempt_disable();
rdp = this_cpu_ptr(&rcu_sched_data);
rnp = rdp->mynode;
- ret = (rdp->grpmask & rnp->qsmaskinit) ||
+ ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
!rcu_scheduler_fully_active;
preempt_enable();
return ret;
@@ -1196,9 +1215,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
} else {
j = jiffies;
gpa = ACCESS_ONCE(rsp->gp_activity);
- pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rsp->name, j - gpa, j, gpa,
- jiffies_till_next_fqs);
+ jiffies_till_next_fqs,
+ rcu_get_root(rsp)->qsmask);
/* In this case, the current CPU might be at fault. */
sched_show_task(current);
}
@@ -1328,20 +1348,30 @@ void rcu_cpu_stall_reset(void)
}
/*
- * Initialize the specified rcu_data structure's callback list to empty.
+ * Initialize the specified rcu_data structure's default callback list
+ * to empty. The default callback list is the one that is not used by
+ * no-callbacks CPUs.
*/
-static void init_callback_list(struct rcu_data *rdp)
+static void init_default_callback_list(struct rcu_data *rdp)
{
int i;
- if (init_nocb_callback_list(rdp))
- return;
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
}
/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+ if (init_nocb_callback_list(rdp))
+ return;
+ init_default_callback_list(rdp);
+}
+
+/*
* Determine the value that ->completed will have at the end of the
* next subsequent grace period. This is used to tag callbacks so that
* a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1703,11 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static int rcu_gp_init(struct rcu_state *rsp)
{
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
ACCESS_ONCE(rsp->gp_activity) = jiffies;
- rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
if (!ACCESS_ONCE(rsp->gp_flags)) {
@@ -1733,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp)
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
raw_spin_unlock_irq(&rnp->lock);
- /* Exclude any concurrent CPU-hotplug operations. */
- mutex_lock(&rsp->onoff_mutex);
- smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
+ /*
+ * Apply per-leaf buffered online and offline operations to the
+ * rcu_node tree. Note that this new grace period need not wait
+ * for subsequent online CPUs, and that quiescent-state forcing
+ * will handle subsequent offline CPUs.
+ */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
+ !rnp->wait_blkd_tasks) {
+ /* Nothing to do on this leaf rcu_node structure. */
+ raw_spin_unlock_irq(&rnp->lock);
+ continue;
+ }
+
+ /* Record old state, apply changes to ->qsmaskinit field. */
+ oldmask = rnp->qsmaskinit;
+ rnp->qsmaskinit = rnp->qsmaskinitnext;
+
+ /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
+ if (!oldmask != !rnp->qsmaskinit) {
+ if (!oldmask) /* First online CPU for this rcu_node. */
+ rcu_init_new_rnp(rnp);
+ else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
+ rnp->wait_blkd_tasks = true;
+ else /* Last offline CPU and can propagate. */
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ /*
+ * If all waited-on tasks from prior grace period are
+ * done, and if all this rcu_node structure's CPUs are
+ * still offline, propagate up the rcu_node tree and
+ * clear ->wait_blkd_tasks. Otherwise, if one of this
+ * rcu_node structure's CPUs has since come back online,
+ * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
+ * checks for this, so just call it unconditionally).
+ */
+ if (rnp->wait_blkd_tasks &&
+ (!rcu_preempt_has_tasks(rnp) ||
+ rnp->qsmaskinit)) {
+ rnp->wait_blkd_tasks = false;
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ raw_spin_unlock_irq(&rnp->lock);
+ }
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1757,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
- WARN_ON_ONCE(rnp->completed != rsp->completed);
- ACCESS_ONCE(rnp->completed) = rsp->completed;
+ if (WARN_ON_ONCE(rnp->completed != rsp->completed))
+ ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
(void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1768,9 +1843,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) &&
+ gp_init_delay > 0 &&
+ !(rsp->gpnum % (rcu_num_nodes * 10)))
+ schedule_timeout_uninterruptible(gp_init_delay);
}
- mutex_unlock(&rsp->onoff_mutex);
return 1;
}
@@ -1798,7 +1876,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = false;
+ isidle = true;
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
@@ -1852,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ WARN_ON_ONCE(rnp->qsmask);
ACCESS_ONCE(rnp->completed) = rsp->gpnum;
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
@@ -1895,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
struct rcu_state *rsp = arg;
struct rcu_node *rnp = rcu_get_root(rsp);
+ rcu_bind_gp_kthread();
for (;;) {
/* Handle grace-period start. */
@@ -2062,25 +2143,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* Similar to rcu_report_qs_rdp(), for which it is a helper function.
* Allows quiescent states for a group of CPUs to be reported at one go
* to the specified rcu_node structure, though all the CPUs in the group
- * must be represented by the same rcu_node structure (which need not be
- * a leaf rcu_node structure, though it often will be). That structure's
- * lock must be held upon entry, and it is released before return.
+ * must be represented by the same rcu_node structure (which need not be a
+ * leaf rcu_node structure, though it often will be). The gps parameter
+ * is the grace-period snapshot, which means that the quiescent states
+ * are valid only if rnp->gpnum is equal to gps. That structure's lock
+ * must be held upon entry, and it is released before return.
*/
static void
rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
- struct rcu_node *rnp, unsigned long flags)
+ struct rcu_node *rnp, unsigned long gps, unsigned long flags)
__releases(rnp->lock)
{
+ unsigned long oldmask = 0;
struct rcu_node *rnp_c;
/* Walk up the rcu_node hierarchy. */
for (;;) {
- if (!(rnp->qsmask & mask)) {
+ if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
- /* Our bit has already been cleared, so done. */
+ /*
+ * Our bit has already been cleared, or the
+ * relevant grace period is already over, so done.
+ */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
+ WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -2104,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
rnp = rnp->parent;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- WARN_ON_ONCE(rnp_c->qsmask);
+ oldmask = rnp_c->qsmask;
}
/*
@@ -2116,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
}
/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period. The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long gps;
+ unsigned long mask;
+ struct rcu_node *rnp_p;
+
+ if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
+ rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return; /* Still need more quiescent states! */
+ }
+
+ rnp_p = rnp->parent;
+ if (rnp_p == NULL) {
+ /*
+ * Only one rcu_node structure in the tree, so don't
+ * try to report up to its nonexistent parent!
+ */
+ rcu_report_qs_rsp(rsp, flags);
+ return;
+ }
+
+ /* Report up the rest of the hierarchy, tracking current ->gpnum. */
+ gps = rnp->gpnum;
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
+}
+
+/*
* Record a quiescent state for the specified CPU to that CPU's rcu_data
* structure. This must be either called from the specified CPU, or
* called when the specified CPU is known to be offline (and when it is
@@ -2163,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
*/
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
- rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ /* ^^^ Released rnp->lock */
if (needwake)
rcu_gp_kthread_wake(rsp);
}
@@ -2256,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
}
- /* Finally, initialize the rcu_data structure's list to empty. */
+ /*
+ * Finally, initialize the rcu_data structure's list to empty and
+ * disallow further callbacks on this CPU.
+ */
init_callback_list(rdp);
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
}
/*
@@ -2355,6 +2488,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock(); /* GP memory ordering. */
rnp->qsmaskinit &= ~mask;
+ rnp->qsmask &= ~mask;
if (rnp->qsmaskinit) {
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
return;
@@ -2364,6 +2498,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
}
/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function. We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ mask = rdp->grpmask;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ rnp->qsmaskinitnext &= ~mask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
* including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2379,29 +2533,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
- /* Exclude any attempts to start a new grace period. */
- mutex_lock(&rsp->onoff_mutex);
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
-
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
rcu_adopt_orphan_cbs(rsp, flags);
raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
- /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
- rnp->qsmaskinit &= ~rdp->grpmask;
- if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
- rcu_cleanup_dead_rnp(rnp);
- rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
cpu, rdp->qlen, rdp->nxtlist);
- init_callback_list(rdp);
- /* Disallow further callbacks on this CPU. */
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
- mutex_unlock(&rsp->onoff_mutex);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2414,6 +2554,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
{
}
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+}
+
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
}
@@ -2589,26 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp,
return;
}
if (rnp->qsmask == 0) {
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- continue;
+ if (rcu_state_p == &rcu_sched_state ||
+ rsp != rcu_state_p ||
+ rcu_preempt_blocked_readers_cgp(rnp)) {
+ /*
+ * No point in scanning bits because they
+ * are all zero. But we might need to
+ * priority-boost blocked readers.
+ */
+ rcu_initiate_boost(rnp, flags);
+ /* rcu_initiate_boost() releases rnp->lock */
+ continue;
+ }
+ if (rnp->parent &&
+ (rnp->parent->qsmask & rnp->grpmask)) {
+ /*
+ * Race between grace-period
+ * initialization and task exiting RCU
+ * read-side critical section: Report.
+ */
+ rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+ /* rcu_report_unblock_qs_rnp() rlses ->lock */
+ continue;
+ }
}
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
if ((rnp->qsmask & bit) != 0) {
- if ((rnp->qsmaskinit & bit) != 0)
- *isidle = false;
+ if ((rnp->qsmaskinit & bit) == 0)
+ *isidle = false; /* Pending hotplug. */
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
}
}
if (mask != 0) {
-
- /* rcu_report_qs_rnp() releases rnp->lock. */
- rcu_report_qs_rnp(mask, rsp, rnp, flags);
- continue;
+ /* Idle/offline CPUs, report (releases rnp->lock). */
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ } else {
+ /* Nothing to do here, so just drop the lock. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
}
@@ -2741,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
*/
- if (!rcu_is_watching() && cpu_online(smp_processor_id()))
+ if (!rcu_is_watching())
invoke_rcu_core();
/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2827,11 +2992,22 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
if (cpu != -1)
rdp = per_cpu_ptr(rsp->rda, cpu);
- offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- WARN_ON_ONCE(offline);
- /* _call_rcu() is illegal on offline CPU; leak the callback. */
- local_irq_restore(flags);
- return;
+ if (likely(rdp->mynode)) {
+ /* Post-boot, so this should be for a no-CBs CPU. */
+ offline = !__call_rcu_nocb(rdp, head, lazy, flags);
+ WARN_ON_ONCE(offline);
+ /* Offline CPU, _call_rcu() illegal, leak callback. */
+ local_irq_restore(flags);
+ return;
+ }
+ /*
+ * Very early boot, before rcu_init(). Initialize if needed
+ * and then drop through to queue the callback.
+ */
+ BUG_ON(cpu != -1);
+ WARN_ON_ONCE(!rcu_is_watching());
+ if (!likely(rdp->nxtlist))
+ init_default_callback_list(rdp);
}
ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
if (lazy)
@@ -2954,7 +3130,7 @@ void synchronize_sched(void)
"Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_sched_expedited();
else
wait_rcu_gp(call_rcu_sched);
@@ -2981,7 +3157,7 @@ void synchronize_rcu_bh(void)
"Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_rcu_bh_expedited();
else
wait_rcu_gp(call_rcu_bh);
@@ -3518,6 +3694,28 @@ void rcu_barrier_sched(void)
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
/*
+ * Propagate ->qsinitmask bits up the rcu_node tree to account for the
+ * first CPU in a given leaf rcu_node structure coming online. The caller
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
+ * disabled.
+ */
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (rnp == NULL)
+ return;
+ raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ rnp->qsmaskinit |= mask;
+ raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+ }
+}
+
+/*
* Do boot-time initialization of a CPU's per-CPU RCU data.
*/
static void __init
@@ -3553,49 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
- /* Exclude new grace periods. */
- mutex_lock(&rsp->onoff_mutex);
-
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
- init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
+ if (!rdp->nxtlist)
+ init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- /* Add CPU to rcu_node bitmasks. */
+ /*
+ * Add CPU to leaf rcu_node pending-online bitmask. Any needed
+ * propagation up the rcu_node tree will happen at the beginning
+ * of the next grace period.
+ */
rnp = rdp->mynode;
mask = rdp->grpmask;
- do {
- /* Exclude any attempts to start a new GP on small systems. */
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->qsmaskinit |= mask;
- mask = rnp->grpmask;
- if (rnp == rdp->mynode) {
- /*
- * If there is a grace period in progress, we will
- * set up to wait for it next time we run the
- * RCU core code.
- */
- rdp->gpnum = rnp->completed;
- rdp->completed = rnp->completed;
- rdp->passed_quiesce = 0;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->qs_pending = 0;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
- }
- raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
- rnp = rnp->parent;
- } while (rnp != NULL && !(rnp->qsmaskinit & mask));
- local_irq_restore(flags);
-
- mutex_unlock(&rsp->onoff_mutex);
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rnp->qsmaskinitnext |= mask;
+ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
+ rdp->completed = rnp->completed;
+ rdp->passed_quiesce = false;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->qs_pending = false;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
static void rcu_prepare_cpu(int cpu)
@@ -3609,15 +3795,14 @@ static void rcu_prepare_cpu(int cpu)
/*
* Handle CPU online/offline notification events.
*/
-static int rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+int rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
- trace_rcu_utilization(TPS("Start CPU hotplug"));
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
@@ -3637,6 +3822,11 @@ static int rcu_cpu_notify(struct notifier_block *self,
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
break;
+ case CPU_DYING_IDLE:
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dying_idle_cpu(cpu, rsp);
+ }
+ break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
@@ -3649,7 +3839,6 @@ static int rcu_cpu_notify(struct notifier_block *self,
default:
break;
}
- trace_rcu_utilization(TPS("End CPU hotplug"));
return NOTIFY_OK;
}
@@ -3660,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self,
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_expedited = 1;
+ rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- rcu_expedited = 0;
+ if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+ rcu_unexpedite_gp();
break;
default:
break;
@@ -3734,30 +3924,26 @@ void rcu_scheduler_starting(void)
* Compute the per-level fanout, either using the exact fanout specified
* or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
*/
-#ifdef CONFIG_RCU_FANOUT_EXACT
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
-{
- int i;
-
- rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
- for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-}
-#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
- int ccur;
- int cprv;
int i;
- cprv = nr_cpu_ids;
- for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
- cprv = ccur;
+ if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+ rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ } else {
+ int ccur;
+ int cprv;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = rsp->levelcnt[i];
+ rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
}
}
-#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
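The unified rcu_init_levelspread() above keeps both behaviors: the exact-fanout branch pins every non-leaf level at CONFIG_RCU_FANOUT, while the balanced branch walks up from nr_cpu_ids taking a ceiling divide at each level. A standalone sketch of that arithmetic, assuming a three-level tree with levelcnt[] = {1, 4, 64} and 256 CPUs (illustrative numbers, not taken from the patch):

/*
 * Standalone sketch of the balanced levelspread walk; not kernel code.
 */
#include <stdio.h>

int main(void)
{
	int levelcnt[] = { 1, 4, 64 };	/* root, interior, leaf node counts */
	int levelspread[3];
	int nr_cpu_ids = 256;
	int cprv = nr_cpu_ids;
	int i;

	for (i = 2; i >= 0; i--) {
		int ccur = levelcnt[i];

		levelspread[i] = (cprv + ccur - 1) / ccur;	/* ceil(cprv / ccur) */
		cprv = ccur;
	}
	/* Expect: leaf spread 4, interior spread 16, root spread 4. */
	for (i = 0; i < 3; i++)
		printf("level %d spread %d\n", i, levelspread[i]);
	return 0;
}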
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
@@ -3833,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
}
- rsp->rda = rda;
init_waitqueue_head(&rsp->gp_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
@@ -3926,6 +4111,8 @@ void __init rcu_init(void)
{
int cpu;
+ rcu_early_boot_tests();
+
rcu_bootup_announce();
rcu_init_geometry();
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
@@ -3942,8 +4129,6 @@ void __init rcu_init(void)
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
-
- rcu_early_boot_tests();
}
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 119de39..a69d3da 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -141,12 +141,20 @@ struct rcu_node {
/* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
+ /* Initialized from ->qsmaskinitnext at the */
+ /* beginning of each grace period. */
+ unsigned long qsmaskinitnext;
+ /* Online CPUs for next grace period. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
int grphi; /* highest-numbered CPU or group here. */
u8 grpnum; /* CPU/group number for next level up. */
u8 level; /* root is at level 0. */
+ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
+ /* exit RCU read-side critical sections */
+ /* before propagating offline up the */
+ /* rcu_node tree? */
struct rcu_node *parent;
struct list_head blkd_tasks;
/* Tasks blocked in RCU read-side critical */
@@ -448,8 +456,6 @@ struct rcu_state {
long qlen; /* Total number of callbacks. */
/* End of fields guarded by orphan_lock. */
- struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
@@ -559,6 +565,7 @@ static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
static void rcu_idle_count_callbacks_posted(void);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9..8c0ec0f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -58,38 +58,33 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
*/
static void __init rcu_bootup_announce_oddness(void)
{
-#ifdef CONFIG_RCU_TRACE
- pr_info("\tRCU debugfs-based tracing is enabled.\n");
-#endif
-#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
- pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
- CONFIG_RCU_FANOUT);
-#endif
-#ifdef CONFIG_RCU_FANOUT_EXACT
- pr_info("\tHierarchical RCU autobalancing is disabled.\n");
-#endif
-#ifdef CONFIG_RCU_FAST_NO_HZ
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
-#endif
-#ifdef CONFIG_PROVE_RCU
- pr_info("\tRCU lockdep checking is enabled.\n");
-#endif
-#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
- pr_info("\tRCU torture testing starts during boot.\n");
-#endif
-#if defined(CONFIG_RCU_CPU_STALL_INFO)
- pr_info("\tAdditional per-CPU info printed with stalls.\n");
-#endif
-#if NUM_RCU_LVL_4 != 0
- pr_info("\tFour-level hierarchy is enabled.\n");
-#endif
+ if (IS_ENABLED(CONFIG_RCU_TRACE))
+ pr_info("\tRCU debugfs-based tracing is enabled.\n");
+ if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
+ (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+ pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+ CONFIG_RCU_FANOUT);
+ if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+ pr_info("\tHierarchical RCU autobalancing is disabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
+ pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ pr_info("\tRCU lockdep checking is enabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
+ pr_info("\tRCU torture testing starts during boot.\n");
+ if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
+ pr_info("\tAdditional per-CPU info printed with stalls.\n");
+ if (NUM_RCU_LVL_4 != 0)
+ pr_info("\tFour-level hierarchy is enabled.\n");
+ if (CONFIG_RCU_FANOUT_LEAF != 16)
+ pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
+ CONFIG_RCU_FANOUT_LEAF);
if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
-#ifdef CONFIG_RCU_BOOST
- pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
-#endif
+ if (IS_ENABLED(CONFIG_RCU_BOOST))
+ pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
}
#ifdef CONFIG_PREEMPT_RCU
@@ -180,7 +175,7 @@ static void rcu_preempt_note_context_switch(void)
* But first, note that the current CPU must still be
* on line!
*/
- WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
+ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
@@ -233,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
}
/*
- * Record a quiescent state for all tasks that were previously queued
- * on the specified rcu_node structure and that were blocking the current
- * RCU grace period. The caller must hold the specified rnp->lock with
- * irqs disabled, and this lock is released upon return, but irqs remain
- * disabled.
- */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
- __releases(rnp->lock)
-{
- unsigned long mask;
- struct rcu_node *rnp_p;
-
- if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return; /* Still need more quiescent states! */
- }
-
- rnp_p = rnp->parent;
- if (rnp_p == NULL) {
- /*
- * Either there is only one rcu_node in the tree,
- * or tasks were kicked up to root rcu_node due to
- * CPUs going offline.
- */
- rcu_report_qs_rsp(&rcu_preempt_state, flags);
- return;
- }
-
- /* Report up the rest of the hierarchy. */
- mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
-}
-
-/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
* returning NULL if at the end of the list.
*/
@@ -300,7 +258,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
*/
void rcu_read_unlock_special(struct task_struct *t)
{
- bool empty;
bool empty_exp;
bool empty_norm;
bool empty_exp_now;
@@ -334,7 +291,13 @@ void rcu_read_unlock_special(struct task_struct *t)
}
/* Hardware IRQ handlers cannot block, complain if they get here. */
- if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
+ if (in_irq() || in_serving_softirq()) {
+ lockdep_rcu_suspicious(__FILE__, __LINE__,
+ "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
+ pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
+ t->rcu_read_unlock_special.s,
+ t->rcu_read_unlock_special.b.blocked,
+ t->rcu_read_unlock_special.b.need_qs);
local_irq_restore(flags);
return;
}
@@ -356,7 +319,6 @@ void rcu_read_unlock_special(struct task_struct *t)
break;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- empty = !rcu_preempt_has_tasks(rnp);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -377,14 +339,6 @@ void rcu_read_unlock_special(struct task_struct *t)
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
- * If this was the last task on the list, go see if we
- * need to propagate ->qsmaskinit bit clearing up the
- * rcu_node tree.
- */
- if (!empty && !rcu_preempt_has_tasks(rnp))
- rcu_cleanup_dead_rnp(rnp);
-
- /*
* If this was the last task on the current list, and if
* we aren't waiting on any CPUs, report the quiescent state.
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
@@ -399,7 +353,8 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->grplo,
rnp->grphi,
!!rnp->gp_tasks);
- rcu_report_unblock_qs_rnp(rnp, flags);
+ rcu_report_unblock_qs_rnp(&rcu_preempt_state,
+ rnp, flags);
} else {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -520,10 +475,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
WARN_ON_ONCE(rnp->qsmask);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Check for a quiescent state from the current CPU. When a task blocks,
* the task is recorded in the corresponding CPU's rcu_node structure,
@@ -585,7 +536,7 @@ void synchronize_rcu(void)
"Illegal synchronize_rcu() in RCU read-side critical section");
if (!rcu_scheduler_active)
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
wait_rcu_gp(call_rcu);
@@ -630,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Most callers will set the "wake" flag, but the task initiating the
- * expedited grace period need not wake itself.
- *
* Caller must hold sync_rcu_preempt_exp_mutex.
*/
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -667,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure. If there are no such
- * tasks, report it up the rcu_node hierarchy.
+ * grace period for the specified rcu_node structure, phase 1. If there
+ * are such tasks, set the ->expmask bits up the rcu_node tree and also
+ * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
+ * that work is needed here.
*
- * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
- * CPU hotplug operations.
+ * Caller must hold sync_rcu_preempt_exp_mutex.
*/
static void
-sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
+sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
{
unsigned long flags;
- int must_wait = 0;
+ unsigned long mask;
+ struct rcu_node *rnp_up;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rnp->expmask);
+ WARN_ON_ONCE(rnp->exp_tasks);
if (!rcu_preempt_has_tasks(rnp)) {
+ /* No blocked tasks, nothing to do. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- } else {
+ return;
+ }
+ /* Call for Phase 2 and propagate ->expmask bits up the tree. */
+ rnp->expmask = 1;
+ rnp_up = rnp;
+ while (rnp_up->parent) {
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ if (rnp_up->expmask & mask)
+ break;
+ raw_spin_lock(&rnp_up->lock); /* irqs already off */
+ smp_mb__after_unlock_lock();
+ rnp_up->expmask |= mask;
+ raw_spin_unlock(&rnp_up->lock); /* irqs still off */
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure, phase 2. If the
+ * leaf rcu_node structure has its ->expmask field set, check for tasks.
+ * If there are some, clear ->expmask and set ->exp_tasks accordingly,
+ * then initiate RCU priority boosting. Otherwise, clear ->expmask and
+ * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
+ * enabling rcu_read_unlock_special() to do the bit-clearing.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void
+sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (!rnp->expmask) {
+ /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+
+ /* Phase 1 is over. */
+ rnp->expmask = 0;
+
+ /*
+ * If there are still blocked tasks, set up ->exp_tasks so that
+ * rcu_read_unlock_special() will wake us and then boost them.
+ */
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->exp_tasks = rnp->blkd_tasks.next;
rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- must_wait = 1;
+ return;
}
- if (!must_wait)
- rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
+
+ /* No longer any blocked tasks, so undo bit setting. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_report_exp_rnp(rsp, rnp, false);
}
/**
@@ -706,7 +710,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- unsigned long flags;
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_preempt_state;
unsigned long snap;
@@ -757,19 +760,16 @@ void synchronize_rcu_expedited(void)
/* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
- /* Initialize ->expmask for all non-leaf rcu_node structures. */
- rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- rnp->expmask = rnp->qsmaskinit;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
-
- /* Snapshot current state of ->blkd_tasks lists. */
+ /*
+ * Snapshot current state of ->blkd_tasks lists into ->expmask.
+ * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
+ * to start clearing them. Doing this in one phase leads to
+ * strange races between setting and clearing bits, so just say "no"!
+ */
+ rcu_for_each_leaf_node(rsp, rnp)
+ sync_rcu_preempt_exp_init1(rsp, rnp);
rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init(rsp, rnp);
- if (NUM_RCU_NODES > 1)
- sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
+ sync_rcu_preempt_exp_init2(rsp, rnp);
put_online_cpus();
@@ -859,8 +859,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Because there is no preemptible RCU, there can be no readers blocked.
*/
@@ -869,8 +867,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
return false;
}
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
@@ -1170,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* Returns zero if all is well, a negated errno otherwise.
*/
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
+ struct rcu_node *rnp)
{
int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
@@ -1180,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
if (&rcu_preempt_state != rsp)
return 0;
- if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+ if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
return 0;
rsp->boost = 1;
@@ -1273,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rnp->qsmaskinit;
+ unsigned long mask = rcu_rnp_online_cpus(rnp);
cpumask_var_t cm;
int cpu;
@@ -1945,7 +1941,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
rhp = ACCESS_ONCE(rdp->nocb_follower_head);
/* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
+ if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+ rcu_scheduler_fully_active) {
/* RCU callback enqueued before CPU first came online??? */
pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
cpu, rhp->func);
@@ -2392,18 +2389,8 @@ void __init rcu_init_nohz(void)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
for_each_rcu_flavor(rsp) {
- for_each_cpu(cpu, rcu_nocb_mask) {
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-
- /*
- * If there are early callbacks, they will need
- * to be moved to the nocb lists.
- */
- WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
- &rdp->nxtlist &&
- rdp->nxttail[RCU_NEXT_TAIL] != NULL);
- init_nocb_callback_list(rdp);
- }
+ for_each_cpu(cpu, rcu_nocb_mask)
+ init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
rcu_organize_nocb_kthreads(rsp);
}
}
@@ -2540,6 +2527,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
if (!rcu_is_nocb_cpu(rdp->cpu))
return false;
+ /* If there are early-boot callbacks, move them to nocb lists. */
+ if (rdp->nxtlist) {
+ rdp->nocb_head = rdp->nxtlist;
+ rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
+ atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
+ atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
+ rdp->nxtlist = NULL;
+ rdp->qlen = 0;
+ rdp->qlen_lazy = 0;
+ }
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
return true;
}
@@ -2763,7 +2760,8 @@ static void rcu_sysidle_exit(int irq)
/*
* Check to see if the current CPU is idle. Note that usermode execution
- * does not count as idle. The caller must have disabled interrupts.
+ * does not count as idle. The caller must have disabled interrupts,
+ * and must be running on tick_do_timer_cpu.
*/
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj)
@@ -2784,8 +2782,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
if (!*isidle || rdp->rsp != rcu_state_p ||
cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
return;
- if (rcu_gp_in_progress(rdp->rsp))
- WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+ /* Verify affinity of current kthread. */
+ WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
/* Pick up current idle and NMI-nesting counter and check. */
cur = atomic_read(&rdtp->dynticks_idle);
@@ -3068,11 +3066,10 @@ static void rcu_bind_gp_kthread(void)
return;
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
cpu = tick_do_timer_cpu;
- if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
+ if (cpu >= 0 && cpu < nr_cpu_ids)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
- if (!is_housekeeping_cpu(raw_smp_processor_id()))
- housekeeping_affine(current);
+ housekeeping_affine(current);
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index fbb6240..f92361e 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_puts(m, "\n");
level = rnp->level;
}
- seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
- rnp->qsmask, rnp->qsmaskinit,
+ seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
+ rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
".G"[rnp->gp_tasks != NULL],
".E"[rnp->exp_tasks != NULL],
".T"[!list_empty(&rnp->blkd_tasks)],
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e0d31a3..1f13335 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,63 @@ MODULE_ALIAS("rcupdate");
module_param(rcu_expedited, int, 0);
+#ifndef CONFIG_TINY_RCU
+
+static atomic_t rcu_expedited_nesting =
+ ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+
+/*
+ * Should normal grace-period primitives be expedited? Intended for
+ * use within RCU. Note that this function takes the rcu_expedited
+ * sysfs/boot variable into account as well as the rcu_expedite_gp()
+ * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
+ * returns false is a -really- bad idea.
+ */
+bool rcu_gp_is_expedited(void)
+{
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
+
+/**
+ * rcu_expedite_gp - Expedite future RCU grace periods
+ *
+ * After a call to this function, future calls to synchronize_rcu() and
+ * friends act as the corresponding synchronize_rcu_expedited() function
+ * had instead been called.
+ */
+void rcu_expedite_gp(void)
+{
+ atomic_inc(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_expedite_gp);
+
+/**
+ * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
+ *
+ * Undo a prior call to rcu_expedite_gp(). If all prior calls to
+ * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
+ * and if the rcu_expedited sysfs/boot parameter is not set, then all
+ * subsequent calls to synchronize_rcu() and friends will return to
+ * their normal non-expedited behavior.
+ */
+void rcu_unexpedite_gp(void)
+{
+ atomic_dec(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+/*
+ * Inform RCU of the end of the in-kernel boot sequence.
+ */
+void rcu_end_inkernel_boot(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
+ rcu_unexpedite_gp();
+}
+
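The new expedite API is a nesting counter OR'd with the existing rcu_expedited knob, which is why looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() returns false is called out as a bad idea. A userspace sketch of the bookkeeping, with illustrative names and a plain int standing in for the kernel's atomic_t:

/*
 * Userspace sketch of the expedite-nesting logic; illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

static int rcu_expedited_param;		/* stands in for the boot/sysfs knob */
static int expedited_nesting = 1;	/* CONFIG_RCU_EXPEDITE_BOOT-style start */

static bool gp_is_expedited(void)
{
	return rcu_expedited_param || expedited_nesting;
}

int main(void)
{
	printf("during boot:  %d\n", gp_is_expedited());
	expedited_nesting--;			/* rcu_end_inkernel_boot() */
	printf("after boot:   %d\n", gp_is_expedited());
	expedited_nesting++;			/* PM_SUSPEND_PREPARE */
	printf("suspending:   %d\n", gp_is_expedited());
	expedited_nesting--;			/* PM_POST_SUSPEND */
	printf("after resume: %d\n", gp_is_expedited());
	return 0;
}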
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -199,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_rcu() instance now that a
- * grace period has elapsed.
+/**
+ * wakeme_after_rcu() - Callback function to awaken a task after grace period
+ * @head: Pointer to rcu_head member within rcu_synchronize structure
+ *
+ * Awaken the corresponding task now that a grace period has elapsed.
*/
-static void wakeme_after_rcu(struct rcu_head *head)
+void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
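Making wakeme_after_rcu() non-static lets other RCU code open-code the wait that wait_rcu_gp() wraps: queue a callback that completes a completion, then block on it. A hedged sketch of that pattern, assuming struct rcu_synchronize and the wakeme_after_rcu() declaration are now visible from a shared header, as removing the local definition above suggests:

/*
 * Sketch only: wait for one grace period by hand, the way wait_rcu_gp()
 * does internally.
 */
#include <linux/rcupdate.h>
#include <linux/completion.h>

static void wait_for_one_grace_period(void)
{
	struct rcu_synchronize rcu;

	init_rcu_head_on_stack(&rcu.head);
	init_completion(&rcu.completion);
	call_rcu(&rcu.head, wakeme_after_rcu);
	wait_for_completion(&rcu.completion);
	destroy_rcu_head_on_stack(&rcu.head);
}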
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62671f5..f9123a8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -306,6 +306,9 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void)
bool sched_can_stop_tick(void)
{
/*
+ * FIFO realtime policy runs the highest priority task. Other runnable
+ * tasks are of a lower priority. The scheduler tick does nothing.
+ */
+ if (current->policy == SCHED_FIFO)
+ return true;
+
+ /*
+ * Round-robin realtime tasks time slice with other tasks at the same
+ * realtime priority. Is this task the only one at this priority?
+ */
+ if (current->policy == SCHED_RR) {
+ struct sched_rt_entity *rt_se = &current->rt;
+
+ return rt_se->run_list.prev == rt_se->run_list.next;
+ }
+
+ /*
* More than one running task need preemption.
* nr_running update is assumed to be visible
* after IPI is sent from wakers.
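For SCHED_RR, the tick can be stopped only when the current task is alone on its priority queue, which the patch detects by checking that the run_list node's prev and next pointers are equal. A standalone sketch of why that test identifies a singleton entry on a circular doubly-linked list (minimal list code, illustrative only):

/*
 * Minimal circular doubly-linked list; prev == next for a node exactly
 * when the node and the list head are the only elements.
 */
#include <stdbool.h>
#include <stdio.h>

struct list_node {
	struct list_node *prev, *next;
};

static void list_init(struct list_node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct list_node *n, struct list_node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static bool only_entry(struct list_node *n)
{
	return n->prev == n->next;
}

int main(void)
{
	struct list_node head, a, b;

	list_init(&head);
	list_add_tail(&a, &head);
	printf("alone: %d\n", only_entry(&a));	/* 1: tick can stop */
	list_add_tail(&b, &head);
	printf("alone: %d\n", only_entry(&a));	/* 0: must round-robin */
	return 0;
}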
@@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq, true);
}
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
@@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ struct task_migration_notifier tmn;
+
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+ tmn.task = p;
+ tmn.from_cpu = task_cpu(p);
+ tmn.to_cpu = new_cpu;
+
+ atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
}
__set_task_cpu(p, new_cpu);
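The notifier chain added above fires once per cross-CPU migration with the task and both CPU numbers. A sketch of a possible consumer, using only the fields and registration helper shown in this hunk; the callback name and its body are illustrative:

/*
 * Possible consumer of the task-migration notifier; illustrative only.
 */
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/printk.h>

static int demo_migration_cb(struct notifier_block *nb,
			     unsigned long unused, void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_debug("task %d moved from CPU %d to CPU %d\n",
		 task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block demo_migration_nb = {
	.notifier_call = demo_migration_cb,
};

/* From some init path: register_task_migration_notifier(&demo_migration_nb); */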
@@ -2818,7 +2853,7 @@ asmlinkage __visible void __sched schedule_user(void)
* we find a better solution.
*
* NB: There are buggy callers of this function. Ideally we
- * should warn if prev_state != IN_USER, but that will trigger
+ * should warn if prev_state != CONTEXT_USER, but that will trigger
* too frequently to make sense yet.
*/
enum ctx_state prev_state = exception_enter();
@@ -5320,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- unsigned long flags;
- long cpu = (long)hcpu;
- struct dl_bw *dl_b;
-
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- set_cpu_active(cpu, false);
-
- /* explicitly allow suspend */
- if (!(action & CPU_TASKS_FROZEN)) {
- bool overflow;
- int cpus;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (overflow)
- return notifier_from_errno(-EBUSY);
- }
+ set_cpu_active((long)hcpu, false);
return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
}
-
- return NOTIFY_DONE;
}
static int __init migration_init(void)
@@ -5430,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5813,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
-/* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
-
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
@@ -5924,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -6235,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -7000,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
*/
case CPU_ONLINE:
- case CPU_DOWN_FAILED:
cpuset_update_active_cpus(true);
break;
default:
@@ -7012,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action) {
+ unsigned long flags;
+ long cpu = (long)hcpu;
+ struct dl_bw *dl_b;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
+ /* explicitly allow suspend */
+ if (!(action & CPU_TASKS_FROZEN)) {
+ bool overflow;
+ int cpus;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
+ }
cpuset_update_active_cpus(false);
break;
case CPU_DOWN_PREPARE_FROZEN:
@@ -7158,8 +7177,8 @@ void __init sched_init(void)
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
- init_rt_rq(&rq->rt, rq);
- init_dl_rq(&rq->dl, rq);
+ init_rt_rq(&rq->rt);
+ init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7199,7 +7218,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -7798,7 +7817,7 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static int sched_dl_global_constraints(void)
+static int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
@@ -7899,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
if (ret)
goto undo;
- ret = sched_rt_global_constraints();
+ ret = sched_dl_global_validate();
if (ret)
goto undo;
- ret = sched_dl_global_constraints();
+ ret = sched_rt_global_constraints();
if (ret)
goto undo;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6..5e95145 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
dl_b->total_bw = 0;
}
-void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
+void init_dl_rq(struct dl_rq *dl_rq)
{
dl_rq->rb_root = RB_ROOT;
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq)
rq->post_schedule = has_pushable_dl_tasks(rq);
}
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+ struct rq *later_rq = NULL;
+ bool fallback = false;
+
+ later_rq = find_lock_later_rq(p, rq);
+
+ if (!later_rq) {
+ int cpu;
+
+ /*
+ * If we cannot preempt any rq, fall back to pick any
+ * online cpu.
+ */
+ fallback = true;
+ cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Fail to find any suitable cpu.
+ * The task will never come back!
+ */
+ BUG_ON(dl_bandwidth_enabled());
+
+ /*
+ * If admission control is disabled we
+ * try a little harder to let the task
+ * run.
+ */
+ cpu = cpumask_any(cpu_active_mask);
+ }
+ later_rq = cpu_rq(cpu);
+ double_lock_balance(rq, later_rq);
+ }
+
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, later_rq->cpu);
+ activate_task(later_rq, p, ENQUEUE_REPLENISH);
+
+ if (!fallback)
+ resched_curr(later_rq);
+
+ double_unlock_balance(rq, later_rq);
+}
+
#else
static inline
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
unsigned long flags;
struct rq *rq;
- rq = task_rq_lock(current, &flags);
+ rq = task_rq_lock(p, &flags);
/*
* We need to take care of several possible races here:
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
+#ifdef CONFIG_SMP
+ /*
+ * If we find that the rq the task was on is no longer
+ * available, we need to select a new rq.
+ */
+ if (unlikely(!rq->online)) {
+ dl_task_offline_migration(rq, p);
+ goto unlock;
+ }
+#endif
+
/*
* If the throttle happened during sched-out; like:
*
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
push_dl_task(rq);
#endif
unlock:
- task_rq_unlock(rq, current, &flags);
+ task_rq_unlock(rq, p, &flags);
return HRTIMER_NORESTART;
}
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq)
}
update_rq_clock(rq);
update_curr_dl(rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq, true);
}
#ifdef CONFIG_SMP
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
int check_resched = 1;
- /*
- * If p is throttled, don't consider the possibility
- * of preempting rq->curr, the check will be done right
- * after its runtime will get replenished.
- */
- if (unlikely(p->dl.dl_throttled))
- return;
-
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8baaf85..a245c1f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
if (!se) {
struct sched_avg *avg = &cpu_rq(cpu)->avg;
P(avg->runnable_avg_sum);
- P(avg->runnable_avg_period);
+ P(avg->avg_period);
return;
}
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
- P(se->avg.runnable_avg_period);
+ P(se->avg.running_avg_sum);
+ P(se->avg.avg_period);
P(se->avg.load_avg_contrib);
+ P(se->avg.utilization_avg_contrib);
P(se->avg.decay_count);
#endif
#undef PN
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->runnable_load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
+ cfs_rq->utilization_load_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
P(se.avg.runnable_avg_sum);
- P(se.avg.runnable_avg_period);
+ P(se.avg.running_avg_sum);
+ P(se.avg.avg_period);
P(se.avg.load_avg_contrib);
+ P(se.avg.utilization_avg_contrib);
P(se.avg.decay_count);
#endif
P(policy);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcfe320..ffeaa41 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static inline void __update_task_entity_contrib(struct sched_entity *se);
+static inline void __update_task_entity_utilization(struct sched_entity *se);
/* Give new task start runnable values to heavy its load in infant time */
void init_task_runnable_average(struct task_struct *p)
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p)
u32 slice;
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = slice;
- p->se.avg.runnable_avg_period = slice;
+ p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
+ p->se.avg.avg_period = slice;
__update_task_entity_contrib(&p->se);
+ __update_task_entity_utilization(&p->se);
}
#else
void init_task_runnable_average(struct task_struct *p)
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env,
static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
- long imb, old_imb;
- long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
+ long orig_src_load;
+ long load_a, load_b;
+ long moved_load;
+ long imb;
/*
* The load is corrected for the CPU capacity available on each node.
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
- if (dst_load < src_load)
- swap(dst_load, src_load);
+ load_a = dst_load;
+ load_b = src_load;
+ if (load_a < load_b)
+ swap(load_a, load_b);
/* Is the difference below the threshold? */
- imb = dst_load * src_capacity * 100 -
- src_load * dst_capacity * env->imbalance_pct;
+ imb = load_a * src_capacity * 100 -
+ load_b * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
/*
* The imbalance is above the allowed threshold.
- * Compare it with the old imbalance.
+ * Allow a move that brings us closer to a balanced situation,
+ * without moving things past the point of balance.
*/
orig_src_load = env->src_stats.load;
- orig_dst_load = env->dst_stats.load;
- if (orig_dst_load < orig_src_load)
- swap(orig_dst_load, orig_src_load);
-
- old_imb = orig_dst_load * src_capacity * 100 -
- orig_src_load * dst_capacity * env->imbalance_pct;
+ /*
+ * In a task swap, there will be one load moving from src to dst,
+ * and another moving back. This is the net sum of both moves.
+ * A simple task move will always have a positive value.
+ * Allow the move if it brings the system closer to a balanced
+ * situation, without crossing over the balance point.
+ */
+ moved_load = orig_src_load - src_load;
- /* Would this change make things worse? */
- return (imb > old_imb);
+ if (moved_load > 0)
+ /* Moving src -> dst. Did we overshoot balance? */
+ return src_load * dst_capacity < dst_load * src_capacity;
+ else
+ /* Moving dst -> src. Did we overshoot balance? */
+ return dst_load * src_capacity < src_load * dst_capacity;
}
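The rewritten check no longer compares against the old imbalance; it only rejects moves that would push load past the balance point, using cross-multiplied load/capacity ratios to avoid division. A standalone sketch of that final test with illustrative numbers:

/*
 * Sketch of the "did we overshoot balance?" step; src/dst loads are
 * post-move values, orig_src_load is the pre-move source load.
 */
#include <stdbool.h>
#include <stdio.h>

static bool overshoots_balance(long src_load, long dst_load,
			       long orig_src_load,
			       long src_capacity, long dst_capacity)
{
	long moved_load = orig_src_load - src_load;

	if (moved_load > 0)	/* moving src -> dst */
		return src_load * dst_capacity < dst_load * src_capacity;
	/* moving dst -> src */
	return dst_load * src_capacity < src_load * dst_capacity;
}

int main(void)
{
	/* Source drops from 600 to 500 while the destination rises to 450:
	 * the source still carries more load, no overshoot (prints 0). */
	printf("%d\n", overshoots_balance(500, 450, 600, 1024, 1024));
	/* Source drops to 350 while the destination rises to 700: the move
	 * crossed the balance point (prints 1). */
	printf("%d\n", overshoots_balance(350, 700, 600, 1024, 1024));
	return 0;
}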
/*
@@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
*period = now - p->last_task_numa_placement;
} else {
delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.runnable_avg_period;
+ *period = p->se.avg.avg_period;
}
p->last_sum_exec_runtime = runtime;
@@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
}
}
/* Next round, evaluate the nodes within max_group. */
+ if (!max_faults)
+ break;
nodes = max_group;
}
return nid;
@@ -2165,8 +2180,10 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+ is_vm_hugetlb_page(vma)) {
continue;
+ }
/*
* Shared library pages mapped by multiple processes are not
@@ -2501,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now,
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
struct sched_avg *sa,
- int runnable)
+ int runnable,
+ int running)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
+ unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
delta = now - sa->last_runnable_update;
/*
@@ -2529,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->last_runnable_update = now;
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->runnable_avg_period % 1024;
+ delta_w = sa->avg_period % 1024;
if (delta + delta_w >= 1024) {
/* period roll-over */
decayed = 1;
@@ -2542,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
delta_w = 1024 - delta_w;
if (runnable)
sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
+ if (running)
+ sa->running_avg_sum += delta_w * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta_w;
delta -= delta_w;
@@ -2552,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
- sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ sa->running_avg_sum = decay_load(sa->running_avg_sum,
+ periods + 1);
+ sa->avg_period = decay_load(sa->avg_period,
periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
- sa->runnable_avg_period += runnable_contrib;
+ if (running)
+ sa->running_avg_sum += runnable_contrib * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
if (runnable)
sa->runnable_avg_sum += delta;
- sa->runnable_avg_period += delta;
+ if (running)
+ sa->running_avg_sum += delta * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta;
return decayed;
}
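running_avg_sum is now scaled by arch_scale_freq_capacity(), so a CPU running below its maximum frequency accrues proportionally less utilization for the same wall-clock running time. A standalone sketch of the scaling step, assuming the usual capacity scale of 1024:

/*
 * Sketch of the per-delta frequency scaling; illustrative only.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

static unsigned long scaled_running_delta(unsigned long delta,
					  unsigned long scale_freq)
{
	return delta * scale_freq >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	/* 1024 units of running time at full frequency count in full. */
	printf("%lu\n", scaled_running_delta(1024, SCHED_CAPACITY_SCALE));
	/* The same running time at half frequency contributes only 512. */
	printf("%lu\n", scaled_running_delta(1024, 512));
	return 0;
}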
@@ -2582,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
return 0;
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.utilization_avg_contrib =
+ decay_load(se->avg.utilization_avg_contrib, decays);
return decays;
}
@@ -2617,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->runnable_avg_period + 1);
+ sa->avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2670,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
+ runnable, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2688,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.runnable_avg_period + 1);
+ contrib /= (se->avg.avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
}
@@ -2707,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
return se->avg.load_avg_contrib - old_contrib;
}
+
+static inline void __update_task_entity_utilization(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
+ contrib /= (se->avg.avg_period + 1);
+ se->avg.utilization_avg_contrib = scale_load(contrib);
+}
+
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.utilization_avg_contrib;
+
+ if (entity_is_task(se))
+ __update_task_entity_utilization(se);
+ else
+ se->avg.utilization_avg_contrib =
+ group_cfs_rq(se)->utilization_load_avg;
+
+ return se->avg.utilization_avg_contrib - old_contrib;
+}
+
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
long load_contrib)
{
@@ -2723,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta;
+ long contrib_delta, utilization_delta;
+ int cpu = cpu_of(rq_of(cfs_rq));
u64 now;
/*
@@ -2735,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
else
now = cfs_rq_clock_task(group_cfs_rq(se));
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
+ cfs_rq->curr == se))
return;
contrib_delta = __update_entity_load_avg_contrib(se);
+ utilization_delta = __update_entity_utilization_avg_contrib(se);
if (!update_cfs_rq)
return;
- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ cfs_rq->utilization_load_avg += utilization_delta;
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ }
}
/*
@@ -2821,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -2839,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
update_cfs_rq_blocked_load(cfs_rq, !sleep);
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -3176,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -4302,6 +4367,11 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -4715,6 +4785,33 @@ next:
done:
return target;
}
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the usage with the capacity of the CPU that is available for CFS
+ * tasks (i.e. cpu_capacity).
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in avg_period and running_load_avg or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays within the range
+ * [0..cpu_capacity_orig] and cap if necessary.
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ */
+static int get_cpu_usage(int cpu)
+{
+ unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ if (usage >= SCHED_LOAD_SCALE)
+ return capacity;
+
+ return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
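get_cpu_usage() therefore maps cfs.utilization_load_avg from the [0..SCHED_LOAD_SCALE] range onto the CPU's original capacity and clamps transient overshoots. A standalone sketch of the arithmetic, assuming SCHED_LOAD_SCALE == 1024:

/*
 * Sketch of the usage-to-capacity mapping and the cap; not kernel code.
 */
#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

static unsigned long cpu_usage(unsigned long utilization_load_avg,
			       unsigned long capacity_orig)
{
	if (utilization_load_avg >= SCHED_LOAD_SCALE)
		return capacity_orig;		/* cap at full capacity */
	return (utilization_load_avg * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
	/* 50% utilization on a CPU with capacity 800 -> usage 400. */
	printf("%lu\n", cpu_usage(512, 800));
	/* A transient 121% utilization is capped to the CPU's capacity. */
	printf("%lu\n", cpu_usage(1239, 800));
	return 0;
}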
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -5841,12 +5938,12 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
+ unsigned long group_usage; /* Total usage of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_has_free_capacity;
+ int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5917,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_capacity(sd, cpu);
-}
-
static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
@@ -5943,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available, age_stamp, avg;
+ u64 total, used, age_stamp, avg;
s64 delta;
/*
@@ -5959,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu)
total = sched_avg_period() + delta;
- if (unlikely(total < avg)) {
- /* Ensures that capacity won't end up being negative */
- available = 0;
- } else {
- available = total - avg;
- }
-
- if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
- total = SCHED_CAPACITY_SCALE;
+ used = div_u64(avg, total);
- total >>= SCHED_CAPACITY_SHIFT;
+ if (likely(used < SCHED_CAPACITY_SCALE))
+ return SCHED_CAPACITY_SCALE - used;
- return div_u64(available, total);
+ return 1;
}
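
As a worked illustration of the simplified formula above, assuming rt_avg is already accumulated in capacity-scaled units (as the sched_rt_avg_update() change later in this series does) and using purely hypothetical numbers:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024ULL

/* Stand-alone sketch of the simplified scale_rt_capacity() math. */
static unsigned long long rt_scaled_capacity(unsigned long long rt_avg,
					     unsigned long long period)
{
	unsigned long long used = rt_avg / period;	/* in [0..1024] */

	if (used < SCHED_CAPACITY_SCALE)
		return SCHED_CAPACITY_SCALE - used;
	return 1;
}

int main(void)
{
	/* hypothetical 1 ms averaging window, RT/IRQ activity at 25% */
	unsigned long long period = 1000000ULL;
	unsigned long long rt_avg = 256ULL * period;

	printf("%llu\n", rt_scaled_capacity(rt_avg, period));	/* prints 768 */
	return 0;
}
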
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -5986,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity >>= SCHED_CAPACITY_SHIFT;
- sdg->sgc->capacity_orig = capacity;
-
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_freq_capacity(sd, cpu);
- else
- capacity *= default_scale_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6009,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, capacity_orig;
+ unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6021,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
return;
}
- capacity_orig = capacity = 0;
+ capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -6041,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
- * This avoids capacity/capacity_orig from being 0 and
+ * This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- capacity_orig += capacity_of(cpu);
capacity += capacity_of(cpu);
continue;
}
sgc = rq->sd->groups->sgc;
- capacity_orig += sgc->capacity_orig;
capacity += sgc->capacity;
}
} else {
@@ -6064,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity_orig += group->sgc->capacity_orig;
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
}
/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true if the capacity is reduced.
*/
static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
- /*
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
- */
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
- return 0;
-
- /*
- * If ~90% of the cpu_capacity is still there, we're good.
- */
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
- return 1;
-
- return 0;
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
}
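
A rough numeric illustration of this threshold, assuming an imbalance_pct of 117 (a value used at some domain levels): a CPU that keeps more than roughly 100/117 ≈ 85% of its original capacity is not flagged as reduced.

#include <stdbool.h>
#include <stdio.h>

/* Stand-alone sketch of the check_cpu_capacity() condition. */
static bool capacity_reduced(unsigned long capacity,
			     unsigned long capacity_orig,
			     unsigned int imbalance_pct)
{
	return (capacity * imbalance_pct) < (capacity_orig * 100);
}

int main(void)
{
	/* 1024 -> 900 leaves ~88%: not reduced with pct = 117 */
	printf("%d\n", capacity_reduced(900, 1024, 117));
	/* 1024 -> 800 leaves ~78%: reported as reduced */
	printf("%d\n", capacity_reduced(800, 1024, 117));
	return 0;
}
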
/*
@@ -6134,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group)
}
/*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the number of tasks is
+ * smaller than the number of CPUs or if the usage is lower than the available
+ * capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity is meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't
+ * provide any benefit for load balancing.
*/
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
{
- unsigned int capacity_factor, smt, cpus;
- unsigned int capacity, capacity_orig;
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
- capacity = group->sgc->capacity;
- capacity_orig = group->sgc->capacity_orig;
- cpus = group->group_weight;
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
+
+ return false;
+}
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
- capacity_factor = cpus / smt; /* cores */
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equivalent to !group_has_capacity because a group
+ * with exactly the right number of tasks has no spare capacity but is not
+ * overloaded, so both group_has_capacity and group_is_overloaded return
+ * false.
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
- capacity_factor = min_t(unsigned,
- capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- return capacity_factor;
+ return false;
}
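
A short sketch of how the two predicates above combine, using hypothetical sgs values and an imbalance_pct of 117; note the middle case, where a group with exactly as many tasks as CPUs is neither "has capacity" nor "overloaded":

#include <stdbool.h>
#include <stdio.h>

struct sgs { unsigned long capacity, usage; unsigned int nr_running, weight; };

static bool has_capacity(const struct sgs *s, unsigned int pct)
{
	if (s->nr_running < s->weight)
		return true;
	return (s->capacity * 100) > (s->usage * pct);
}

static bool is_overloaded(const struct sgs *s, unsigned int pct)
{
	if (s->nr_running <= s->weight)
		return false;
	return (s->capacity * 100) < (s->usage * pct);
}

int main(void)
{
	/* hypothetical 2-CPU group with capacity 2048 */
	struct sgs spare = { 2048, 1000, 1, 2 };  /* fewer tasks than CPUs   */
	struct sgs exact = { 2048, 2000, 2, 2 };  /* exactly fitting         */
	struct sgs over  = { 2048, 2400, 3, 2 };  /* more tasks than it fits */

	printf("%d %d\n", has_capacity(&spare, 117), is_overloaded(&spare, 117)); /* 1 0 */
	printf("%d %d\n", has_capacity(&exact, 117), is_overloaded(&exact, 117)); /* 0 0 */
	printf("%d %d\n", has_capacity(&over, 117),  is_overloaded(&over, 117));  /* 0 1 */
	return 0;
}
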
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_no_capacity)
return group_overloaded;
if (sg_imbalanced(group))
@@ -6202,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
+ sgs->group_usage += get_cpu_usage(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)
@@ -6224,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
- sgs->group_type = group_classify(group, sgs);
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
- sgs->group_has_free_capacity = 1;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
}
/**
@@ -6350,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity factor to one so that we'll try
+ * first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity_factor. The
- * extra check prevents the case where you always pull from the
- * heaviest group when it is already under-utilized (possible
- * with a large weight task outweighs the tasks on the system).
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible when a large weight task outweighs
+ * the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity) {
- sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
- sgs->group_type = group_classify(sg, sgs);
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -6541,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity_factor);
-
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
- load_above_capacity /= busiest->group_capacity;
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
}
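
A worked example of the new excess-load computation above, with hypothetical numbers (5 tasks on a 2-CPU group whose capacity is 2048, SCHED_LOAD_SCALE of 1024 assumed):

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

int main(void)
{
	unsigned long sum_nr_running = 5, group_capacity = 2048;
	unsigned long load_above_capacity = sum_nr_running * SCHED_LOAD_SCALE;

	if (load_above_capacity > group_capacity)
		load_above_capacity -= group_capacity;	/* 5120 - 2048 = 3072 */
	else
		load_above_capacity = ~0UL;		/* no measurable excess */

	printf("%lu\n", load_above_capacity);
	return 0;
}
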
/*
@@ -6608,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
+ /* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
@@ -6628,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
- !busiest->group_has_free_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
goto force_balance;
/*
@@ -6688,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, capacity_factor, wl;
+ unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6717,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
capacity = capacity_of(i);
- capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
@@ -6727,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
- if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
continue;
/*
@@ -6775,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ /*
+ * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched_class or IRQs if more capacity stays
+ * available on dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -6874,6 +6961,9 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
@@ -6883,8 +6973,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -7584,22 +7672,25 @@ end:
/*
* Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
* - This rq has more than one task.
- * - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's capacity.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At the parent of the LLC scheduler domain level, this cpu's scheduler
+ *   group has multiple busy cpus.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
+ bool kick = false;
if (unlikely(rq->idle_balance))
- return 0;
+ return false;
/*
* We may be recently in ticked or tickless idle mode. At the first
@@ -7613,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return 0;
+ return false;
if (time_before(now, nohz.next_balance))
- return 0;
+ return false;
if (rq->nr_running >= 2)
- goto need_kick;
+ return true;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
if (sd) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (nr_busy > 1)
- goto need_kick_unlock;
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
- goto need_kick_unlock;
-
- rcu_read_unlock();
- return 0;
+ sched_domain_span(sd)) < cpu)) {
+ kick = true;
+ goto unlock;
+ }
-need_kick_unlock:
+unlock:
rcu_read_unlock();
-need_kick:
- return 1;
+ return kick;
}
#else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7660,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(this_rq, idle);
-
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
- * stopped.
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * give the idle cpus a chance to load balance. Else we may
+ * load balance only within the local sched_domain hierarchy
+ * and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
+ rebalance_domains(this_rq, idle);
}
/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d1..91e33cd 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * In order to avoid a thundering herd of CPUs that are
+ * lowering their priorities at the same time, while there is
+ * a single CPU with an RT task that can migrate and is waiting
+ * to run, the other CPUs would all try to take that CPU's
+ * rq lock and possibly create large contention. Sending an
+ * IPI to that CPU and letting that CPU push the RT task to where
+ * it should go may be the better approach.
+ */
+SCHED_FEAT(RT_PUSH_IPI, true)
+#endif
+
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80014a1..deef1ca 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,8 +158,7 @@ static void cpuidle_idle_call(void)
* is used from another cpu as a broadcast timer, this call may
* fail if it is not available
*/
- if (broadcast &&
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ if (broadcast && tick_broadcast_enter())
goto use_default;
/* Take note of the planned idle state. */
@@ -176,7 +175,7 @@ static void cpuidle_idle_call(void)
idle_set_state(this_rq(), NULL);
if (broadcast)
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+ tick_broadcast_exit();
/*
* Give the governor an opportunity to reflect on the outcome
@@ -210,6 +209,8 @@ use_default:
goto exit_idle;
}
+DEFINE_PER_CPU(bool, cpu_dead_idle);
+
/*
* Generic idle loop implementation
*
@@ -234,8 +235,13 @@ static void cpu_idle_loop(void)
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id()))
+ if (cpu_is_offline(smp_processor_id())) {
+ rcu_cpu_notify(NULL, CPU_DYING_IDLE,
+ (void *)(long)smp_processor_id());
+ smp_mb(); /* all activity before dead. */
+ this_cpu_write(cpu_dead_idle, true);
arch_cpu_idle_dead();
+ }
local_irq_disable();
arch_cpu_idle_enter();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f4d4b07..575da76 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -6,6 +6,7 @@
#include "sched.h"
#include <linux/slab.h>
+#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+#ifdef CONFIG_SMP
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
int i;
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+ rt_rq->push_flags = 0;
+ rt_rq->push_cpu = nr_cpu_ids;
+ raw_spin_lock_init(&rt_rq->push_lock);
+ init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
+#endif /* CONFIG_SMP */
/* We start in dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
- init_rt_rq(rt_rq, cpu_rq(i));
+ init_rt_rq(rt_rq);
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq)
;
}
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
+ *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
+ */
+static int rto_next_cpu(struct rq *rq)
+{
+ int prev_cpu = rq->rt.push_cpu;
+ int cpu;
+
+ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
+ /*
+ * If the previous cpu is less than the rq's CPU, then it already
+ * passed the end of the mask, and has started from the beginning.
+ * We end if the next CPU is greater or equal to rq's CPU.
+ */
+ if (prev_cpu < rq->cpu) {
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+
+ } else if (cpu >= nr_cpu_ids) {
+ /*
+ * We passed the end of the mask, start at the beginning.
+ * If the result is greater or equal to the rq's CPU, then
+ * the loop is finished.
+ */
+ cpu = cpumask_first(rq->rd->rto_mask);
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+ }
+ rq->rt.push_cpu = cpu;
+
+ /* Return cpu to let the caller know if the loop is finished or not */
+ return cpu;
+}
+
+static int find_next_push_cpu(struct rq *rq)
+{
+ struct rq *next_rq;
+ int cpu;
+
+ while (1) {
+ cpu = rto_next_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ break;
+ next_rq = cpu_rq(cpu);
+
+ /* Make sure the next rq can push to this rq */
+ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ break;
+ }
+
+ return cpu;
+}
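
A minimal stand-alone sketch of the wrap-around iteration performed by rto_next_cpu(), using a plain bitmask instead of a struct cpumask; the 8-CPU mask and the values in main() are illustrative only:

#include <stdio.h>

#define NR_CPUS	8

/* Like cpumask_next(): first set bit strictly after 'prev', or NR_CPUS. */
static int mask_next(unsigned int mask, int prev)
{
	for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return NR_CPUS;
}

/* Sketch of rto_next_cpu()'s wrap-around step; never returns this_cpu. */
static int next_push_cpu_sketch(unsigned int rto_mask, int this_cpu, int *push_cpu)
{
	int prev = *push_cpu;
	int cpu = mask_next(rto_mask, prev);

	if (prev < this_cpu) {			/* already wrapped once */
		if (cpu >= this_cpu)
			return NR_CPUS;		/* loop complete */
	} else if (cpu >= NR_CPUS) {		/* hit the end: wrap around */
		cpu = mask_next(rto_mask, -1);
		if (cpu >= this_cpu)
			return NR_CPUS;		/* loop complete */
	}
	*push_cpu = cpu;
	return cpu;
}

int main(void)
{
	unsigned int rto_mask = 0xB6;	/* CPUs 1, 2, 4, 5, 7 are RT-overloaded */
	int this_cpu = 4, push_cpu = this_cpu, cpu;

	/* Visits 5, 7, wraps to 1, 2, then reports completion. */
	while ((cpu = next_push_cpu_sketch(rto_mask, this_cpu, &push_cpu)) < NR_CPUS)
		printf("visit cpu %d\n", cpu);
	return 0;
}
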
+
+#define RT_PUSH_IPI_EXECUTING 1
+#define RT_PUSH_IPI_RESTART 2
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu;
+
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ raw_spin_lock(&rq->rt.push_lock);
+ /* Make sure it's still executing */
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ /*
+ * Tell the IPI to restart the loop as things have
+ * changed since it started.
+ */
+ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+ raw_spin_unlock(&rq->rt.push_lock);
+ return;
+ }
+ raw_spin_unlock(&rq->rt.push_lock);
+ }
+
+ /* When here, there's no IPI going around */
+
+ rq->rt.push_cpu = rq->cpu;
+ cpu = find_next_push_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+ irq_work_queue_on(&rq->rt.push_work, cpu);
+}
+
+/* Called from hardirq context */
+static void try_to_push_tasks(void *arg)
+{
+ struct rt_rq *rt_rq = arg;
+ struct rq *rq, *src_rq;
+ int this_cpu;
+ int cpu;
+
+ this_cpu = rt_rq->push_cpu;
+
+ /* Paranoid check */
+ BUG_ON(this_cpu != smp_processor_id());
+
+ rq = cpu_rq(this_cpu);
+ src_rq = rq_of_rt_rq(rt_rq);
+
+again:
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+ push_rt_task(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ /* Pass the IPI to the next rt overloaded queue */
+ raw_spin_lock(&rt_rq->push_lock);
+ /*
+ * If the source queue changed since the IPI went out,
+ * we need to restart the search from that CPU again.
+ */
+ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+ rt_rq->push_cpu = src_rq->cpu;
+ }
+
+ cpu = find_next_push_cpu(src_rq);
+
+ if (cpu >= nr_cpu_ids)
+ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+ raw_spin_unlock(&rt_rq->push_lock);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * It is possible that a restart caused this CPU to be
+ * chosen again. Don't bother with an IPI, just see if we
+ * have more to push.
+ */
+ if (unlikely(cpu == rq->cpu))
+ goto again;
+
+ /* Try the next RT overloaded CPU */
+ irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+ try_to_push_tasks(rt_rq);
+}
+#endif /* HAVE_RT_PUSH_IPI */
+
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+#ifdef HAVE_RT_PUSH_IPI
+ if (sched_feat(RT_PUSH_IPI)) {
+ tell_cpu_to_push(this_rq);
+ return 0;
+ }
+#endif
+
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc0f435..e0e1299 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
@@ -362,8 +363,14 @@ struct cfs_rq {
* Under CFS, load is tracked on a per-entity basis and aggregated up.
* This allows for the description of both thread and group usage (in
* the FAIR_GROUP_SCHED case).
+ * runnable_load_avg is the sum of the load_avg_contrib of the
+ * sched_entities on the rq.
+ * blocked_load_avg is similar to runnable_load_avg except that it
+ * tracks the blocked sched_entities on the rq.
+ * utilization_load_avg is the sum of the average running time of the
+ * sched_entities on the rq.
*/
- unsigned long runnable_load_avg, blocked_load_avg;
+ unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
atomic64_t decay_counter;
u64 last_decay;
atomic_long_t removed_load;
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
+/* RT IPI pull logic requires IRQ_WORK */
+#ifdef CONFIG_IRQ_WORK
+# define HAVE_RT_PUSH_IPI
+#endif
+
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
@@ -435,7 +447,13 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
+#ifdef HAVE_RT_PUSH_IPI
+ int push_flags;
+ int push_cpu;
+ struct irq_work push_work;
+ raw_spinlock_t push_lock;
#endif
+#endif /* CONFIG_SMP */
int rt_queued;
int rt_throttled;
@@ -597,6 +615,7 @@ struct rq {
struct sched_domain *sd;
unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
unsigned char idle_balance;
/* For active balancing */
@@ -807,7 +826,7 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity, capacity_orig;
+ unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq)
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta;
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
sched_avg_update(rq);
}
#else
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
-extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f2..c697f73 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
put_online_cpus();
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+
+static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
+
+/*
+ * Called to poll the specified CPU's state, for example, when waiting for
+ * a CPU to come online.
+ */
+int cpu_report_state(int cpu)
+{
+ return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+}
+
+/*
+ * If CPU has died properly, set its state to CPU_UP_PREPARE and
+ * return success. Otherwise, return -EBUSY if the CPU died after
+ * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
+ * if cpu_wait_death() timed out and the CPU still hasn't gotten around
+ * to dying. In the latter two cases, the CPU might not be set up
+ * properly, but it is up to the arch-specific code to decide.
+ * Finally, -EIO indicates an unanticipated problem.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+int cpu_check_up_prepare(int cpu)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+ }
+
+ switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
+
+ case CPU_POST_DEAD:
+
+ /* The CPU died properly, so just start it up again. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+
+ case CPU_DEAD_FROZEN:
+
+ /*
+ * Timeout during CPU death, so let caller know.
+ * The outgoing CPU completed its processing, but after
+ * cpu_wait_death() timed out and reported the error. The
+ * caller is free to proceed, in which case the state
+ * will be reset properly by cpu_set_state_online().
+ * Proceeding despite this -EBUSY return makes sense
+ * for systems where the outgoing CPUs take themselves
+ * offline, with no post-death manipulation required from
+ * a surviving CPU.
+ */
+ return -EBUSY;
+
+ case CPU_BROKEN:
+
+ /*
+ * The most likely reason we got here is that there was
+ * a timeout during CPU death, and the outgoing CPU never
+ * did complete its processing. This could happen on
+ * a virtualized system if the outgoing VCPU gets preempted
+ * for more than five seconds, and the user attempts to
+ * immediately online that same CPU. Trying again later
+ * might return -EBUSY above, hence -EAGAIN.
+ */
+ return -EAGAIN;
+
+ default:
+
+ /* Should not happen. Famous last words. */
+ return -EIO;
+ }
+}
+
+/*
+ * Mark the specified CPU online.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+void cpu_set_state_online(int cpu)
+{
+ (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Wait for the specified CPU to exit the idle loop and die.
+ */
+bool cpu_wait_death(unsigned int cpu, int seconds)
+{
+ int jf_left = seconds * HZ;
+ int oldstate;
+ bool ret = true;
+ int sleep_jf = 1;
+
+ might_sleep();
+
+ /* The outgoing CPU will normally get done quite quickly. */
+ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
+ goto update_state;
+ udelay(5);
+
+ /* But if the outgoing CPU dawdles, wait increasingly long times. */
+ while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
+ schedule_timeout_uninterruptible(sleep_jf);
+ jf_left -= sleep_jf;
+ if (jf_left <= 0)
+ break;
+ sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
+ }
+update_state:
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate == CPU_DEAD) {
+ /* Outgoing CPU died normally, update state. */
+ smp_mb(); /* atomic_read() before update. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
+ } else {
+ /* Outgoing CPU still hasn't died, set state accordingly. */
+ if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, CPU_BROKEN) != oldstate)
+ goto update_state;
+ ret = false;
+ }
+ return ret;
+}
+
+/*
+ * Called by the outgoing CPU to report its successful death. Return
+ * false if this report follows the surviving CPU's timing out.
+ *
+ * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
+ * timed out. This approach allows architectures to omit calls to
+ * cpu_check_up_prepare() and cpu_set_state_online() without defeating
+ * the next cpu_wait_death()'s polling loop.
+ */
+bool cpu_report_death(void)
+{
+ int oldstate;
+ int newstate;
+ int cpu = smp_processor_id();
+
+ do {
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate != CPU_BROKEN)
+ newstate = CPU_DEAD;
+ else
+ newstate = CPU_DEAD_FROZEN;
+ } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, newstate) != oldstate);
+ return newstate == CPU_DEAD;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
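
A rough sketch of how an architecture might pair up the helpers added above; the arch_* and example_* names are hypothetical, kernel context is assumed, and only cpu_report_death(), cpu_wait_death(), cpu_check_up_prepare() and cpu_set_state_online() come from this patch:

/* Hypothetical low-level halt, for illustration only. */
static void arch_halt_this_cpu(void)
{
	for (;;)
		;
}

/* Outgoing CPU, at the end of its offline/idle path: */
static void example_cpu_die_self(void)
{
	(void)cpu_report_death();	/* returns false if the waiter already timed out */
	arch_halt_this_cpu();
}

/* Surviving CPU, waiting for the outgoing CPU in its __cpu_die(): */
static void example_cpu_die(unsigned int cpu)
{
	if (!cpu_wait_death(cpu, 5))	/* wait up to roughly 5 seconds */
		pr_err("CPU %u did not report its death\n", cpu);
}

/*
 * Surviving CPU, before restarting the CPU. The incoming CPU is then
 * expected to call cpu_set_state_online() once it is fully up.
 */
static int example_cpu_up_prepare(unsigned int cpu)
{
	return cpu_check_up_prepare(cpu);	/* -EBUSY/-EAGAIN if the last death was messy */
}
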
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 88ea2d6..ce410bb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "dirtytime_expire_seconds",
+ .data = &dirtytime_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = dirtytime_interval_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "nr_pdflush_threads",
.mode = 0444 /* read-only */,
.proc_handler = pdflush_proc_obsolete,
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d626dc9..579ce1b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET
config GENERIC_CLOCKEVENTS
bool
-# Migration helper. Builds, but does not invoke
-config GENERIC_CLOCKEVENTS_BUILD
- bool
- default y
- depends on GENERIC_CLOCKEVENTS
-
# Architecture can handle broadcast in a driver-agnostic way
config ARCH_HAS_TICK_BROADCAST
bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index c09c078..01f0312 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
obj-y += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 489642b..25d942d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,51 +94,49 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
-static int __clockevents_set_mode(struct clock_event_device *dev,
- enum clock_event_mode mode)
+static int __clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
/* Transition with legacy set_mode() callback */
if (dev->set_mode) {
/* Legacy callback doesn't support new modes */
- if (mode > CLOCK_EVT_MODE_RESUME)
+ if (state > CLOCK_EVT_STATE_ONESHOT)
return -ENOSYS;
- dev->set_mode(mode, dev);
+ /*
+ * 'clock_event_state' and 'clock_event_mode' have a 1-to-1
+ * mapping until *_ONESHOT, and so a simple cast will work.
+ */
+ dev->set_mode((enum clock_event_mode)state, dev);
+ dev->mode = (enum clock_event_mode)state;
return 0;
}
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
- /* Transition with new mode-specific callbacks */
- switch (mode) {
- case CLOCK_EVT_MODE_UNUSED:
+ /* Transition with new state-specific callbacks */
+ switch (state) {
+ case CLOCK_EVT_STATE_DETACHED:
/*
* This is an internal state, which is guaranteed to go from
- * SHUTDOWN to UNUSED. No driver interaction required.
+ * SHUTDOWN to DETACHED. No driver interaction required.
*/
return 0;
- case CLOCK_EVT_MODE_SHUTDOWN:
- return dev->set_mode_shutdown(dev);
+ case CLOCK_EVT_STATE_SHUTDOWN:
+ return dev->set_state_shutdown(dev);
- case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_STATE_PERIODIC:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
return -ENOSYS;
- return dev->set_mode_periodic(dev);
+ return dev->set_state_periodic(dev);
- case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_STATE_ONESHOT:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
return -ENOSYS;
- return dev->set_mode_oneshot(dev);
-
- case CLOCK_EVT_MODE_RESUME:
- /* Optional callback */
- if (dev->set_mode_resume)
- return dev->set_mode_resume(dev);
- else
- return 0;
+ return dev->set_state_oneshot(dev);
default:
return -ENOSYS;
@@ -146,26 +144,26 @@ static int __clockevents_set_mode(struct clock_event_device *dev,
}
/**
- * clockevents_set_mode - set the operating mode of a clock event device
+ * clockevents_set_state - set the operating state of a clock event device
* @dev: device to modify
- * @mode: new mode
+ * @state: new state
*
* Must be called with interrupts disabled !
*/
-void clockevents_set_mode(struct clock_event_device *dev,
- enum clock_event_mode mode)
+void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
- if (dev->mode != mode) {
- if (__clockevents_set_mode(dev, mode))
+ if (dev->state != state) {
+ if (__clockevents_set_state(dev, state))
return;
- dev->mode = mode;
+ dev->state = state;
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
- if (mode == CLOCK_EVT_MODE_ONESHOT) {
+ if (state == CLOCK_EVT_STATE_ONESHOT) {
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
@@ -180,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev,
*/
void clockevents_shutdown(struct clock_event_device *dev)
{
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event.tv64 = KTIME_MAX;
}
+/**
+ * clockevents_tick_resume - Resume the tick device before using it again
+ * @dev: device to resume
+ */
+int clockevents_tick_resume(struct clock_event_device *dev)
+{
+ int ret = 0;
+
+ if (dev->set_mode) {
+ dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
+ dev->mode = CLOCK_EVT_MODE_RESUME;
+ } else if (dev->tick_resume) {
+ ret = dev->tick_resume(dev);
+ }
+
+ return ret;
+}
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
/* Limit min_delta to a jiffie */
@@ -236,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
dev->retries++;
@@ -273,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
dev->retries++;
@@ -305,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
dev->next_event = expires;
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
/* Shortcut for clockevent devices that can deal with ktime. */
@@ -350,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced)
struct clock_event_device *dev, *newdev = NULL;
list_for_each_entry(dev, &clockevent_devices, list) {
- if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
+ if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
continue;
if (!tick_check_replacement(newdev, dev))
@@ -376,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced)
static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
/* Fast track. Device is unused */
- if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
+ if (ced->state == CLOCK_EVT_STATE_DETACHED) {
list_del_init(&ced->list);
return 0;
}
@@ -426,30 +442,32 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
}
EXPORT_SYMBOL_GPL(clockevents_unbind);
-/* Sanity check of mode transition callbacks */
+/* Sanity check of state transition callbacks */
static int clockevents_sanity_check(struct clock_event_device *dev)
{
/* Legacy set_mode() callback */
if (dev->set_mode) {
/* We shouldn't be supporting new modes now */
- WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot ||
- dev->set_mode_shutdown || dev->set_mode_resume);
+ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
+ dev->set_state_shutdown || dev->tick_resume);
+
+ BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
return 0;
}
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
- /* New mode-specific callbacks */
- if (!dev->set_mode_shutdown)
+ /* New state-specific callbacks */
+ if (!dev->set_state_shutdown)
return -EINVAL;
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
- !dev->set_mode_periodic)
+ !dev->set_state_periodic)
return -EINVAL;
if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
- !dev->set_mode_oneshot)
+ !dev->set_state_oneshot)
return -EINVAL;
return 0;
@@ -463,9 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
BUG_ON(clockevents_sanity_check(dev));
+ /* Initialize state to DETACHED */
+ dev->state = CLOCK_EVT_STATE_DETACHED;
+
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
@@ -529,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
clockevents_config(dev, freq);
- if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+ if (dev->state == CLOCK_EVT_STATE_ONESHOT)
return clockevents_program_event(dev, dev->next_event, false);
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- return __clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
+ return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
return 0;
}
@@ -575,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev)
* @old: device to release (can be NULL)
* @new: device to request (can be NULL)
*
- * Called from the notifier chain. clockevents_lock is held already
+ * Called from various tick functions with clockevents_lock held and
+ * interrupts disabled.
*/
void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new)
{
- unsigned long flags;
-
- local_irq_save(flags);
/*
* Caller releases a clock event device. We queue it into the
* released list and do a notify add later.
*/
if (old) {
module_put(old->owner);
- clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+ clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
}
if (new) {
- BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
clockevents_shutdown(new);
}
- local_irq_restore(flags);
}
/**
@@ -625,74 +642,40 @@ void clockevents_resume(void)
dev->resume(dev);
}
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
+#ifdef CONFIG_HOTPLUG_CPU
/**
- * clockevents_notify - notification about relevant events
- * Returns 0 on success, any other value on error
+ * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
*/
-int clockevents_notify(unsigned long reason, void *arg)
+void tick_cleanup_dead_cpu(int cpu)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
- int cpu, ret = 0;
raw_spin_lock_irqsave(&clockevents_lock, flags);
- switch (reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
- tick_broadcast_on_off(reason, arg);
- break;
-
- case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
- case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
- ret = tick_broadcast_oneshot_control(reason);
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DYING:
- tick_handover_do_timer(arg);
- break;
-
- case CLOCK_EVT_NOTIFY_SUSPEND:
- tick_suspend();
- tick_suspend_broadcast();
- break;
-
- case CLOCK_EVT_NOTIFY_RESUME:
- tick_resume();
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DEAD:
- tick_shutdown_broadcast_oneshot(arg);
- tick_shutdown_broadcast(arg);
- tick_shutdown(arg);
- /*
- * Unregister the clock event devices which were
- * released from the users in the notify chain.
- */
- list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ tick_shutdown_broadcast_oneshot(cpu);
+ tick_shutdown_broadcast(cpu);
+ tick_shutdown(cpu);
+ /*
+ * Unregister the clock event devices which were
+ * released from the users in the notify chain.
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ list_del(&dev->list);
+ /*
+ * Now check whether the CPU has left unused per cpu devices
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+ if (cpumask_test_cpu(cpu, dev->cpumask) &&
+ cpumask_weight(dev->cpumask) == 1 &&
+ !tick_is_broadcast_device(dev)) {
+ BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
list_del(&dev->list);
- /*
- * Now check whether the CPU has left unused per cpu devices
- */
- cpu = *((int *)arg);
- list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
- if (cpumask_test_cpu(cpu, dev->cpumask) &&
- cpumask_weight(dev->cpumask) == 1 &&
- !tick_is_broadcast_device(dev)) {
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
- list_del(&dev->list);
- }
}
- break;
- default:
- break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
- return ret;
}
-EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
#ifdef CONFIG_SYSFS
struct bus_type clockevents_subsys = {
@@ -811,5 +794,3 @@ static int __init clockevents_init_sysfs(void)
}
device_initcall(clockevents_init_sysfs);
#endif /* SYSFS */
-
-#endif /* GENERIC_CLOCK_EVENTS */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c3be3c7..15facb1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -472,8 +472,11 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* @max_cyc: maximum cycle value before potential overflow (does not include
* any safety margin)
*
- * NOTE: This function includes a safety margin of 50%, so that bad clock values
- * can be detected.
+ * NOTE: This function includes a safety margin of 50%, in other words, we
+ * return half the number of nanoseconds the hardware counter can technically
+ * cover. This is done so that we can potentially detect problems caused by
+ * delayed timers or bad hardware, which might result in time intervals that
+ * are larger than what the math used can handle without overflows.
*/
u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bee0c1f..76d4bd9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,7 +54,7 @@
#include <trace/events/timer.h>
-#include "timekeeping.h"
+#include "tick-internal.h"
/*
* The timer bases:
@@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
- break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- {
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
migrate_hrtimers(scpu);
break;
- }
#endif
default:
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c4bb518..347fecf 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -25,7 +25,7 @@
#include <linux/module.h>
#include <linux/init.h>
-#include "tick-internal.h"
+#include "timekeeping.h"
/* The Jiffies based clocksource is the lowest common
* denominator clock source which should function on
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 0f60b08..7a68100 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/rtc.h>
-#include "tick-internal.h"
#include "ntp_internal.h"
/*
@@ -459,6 +458,16 @@ out:
return leap;
}
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock64(struct timespec64 now64)
+{
+ struct timespec now;
+
+ now = timespec64_to_timespec(now64);
+ return update_persistent_clock(now);
+}
+#endif
+
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_cmos_clock(struct work_struct *work);
@@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work)
if (persistent_clock_is_local)
adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
- fail = update_persistent_clock(timespec64_to_timespec(adjust));
+ fail = update_persistent_clock64(adjust);
#endif
+
#ifdef CONFIG_RTC_SYSTOHC
if (fail == -ENODEV)
fail = rtc_set_ntp_time(adjust);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 066f0ec..7e8ca4f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask;
static cpumask_var_t tick_broadcast_on;
static cpumask_var_t tmpmask;
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
-static int tick_broadcast_force;
+static int tick_broadcast_forced;
#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
#else
static inline void tick_broadcast_clear_oneshot(int cpu) { }
+static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
#endif
/*
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
/*
* The device is in periodic mode. No reprogramming necessary:
*/
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
goto unlock;
/*
@@ -324,49 +326,54 @@ unlock:
raw_spin_unlock(&tick_broadcast_lock);
}
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_control - Enable/disable or force broadcast mode
+ * @mode: The selected broadcast mode
+ *
+ * Called when the system enters a state where affected tick devices
+ * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
*/
-static void tick_do_broadcast_on_off(unsigned long *reason)
+void tick_broadcast_control(enum tick_broadcast_mode mode)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags;
int cpu, bc_stopped;
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
- cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
+ td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
- bc = tick_broadcast_device.evtdev;
/*
* Is the device not affected by the powerstate ?
*/
if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
- goto out;
+ return;
if (!tick_device_is_functional(dev))
- goto out;
+ return;
+ raw_spin_lock(&tick_broadcast_lock);
+ cpu = smp_processor_id();
+ bc = tick_broadcast_device.evtdev;
bc_stopped = cpumask_empty(tick_broadcast_mask);
- switch (*reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ switch (mode) {
+ case TICK_BROADCAST_FORCE:
+ tick_broadcast_forced = 1;
+ case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
}
- if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
- tick_broadcast_force = 1;
break;
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- if (tick_broadcast_force)
+
+ case TICK_BROADCAST_OFF:
+ if (tick_broadcast_forced)
break;
cpumask_clear_cpu(cpu, tick_broadcast_on);
if (!tick_device_is_functional(dev))
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
else
tick_broadcast_setup_oneshot(bc);
}
-out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop.
- */
-void tick_broadcast_on_off(unsigned long reason, int *oncpu)
-{
- if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
- printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
- "offline CPU #%d\n", *oncpu);
- else
- tick_do_broadcast_on_off(&reason);
+ raw_spin_unlock(&tick_broadcast_lock);
}
+EXPORT_SYMBOL_GPL(tick_broadcast_control);
/*
* Set the periodic handler depending on broadcast on/off
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
dev->event_handler = tick_handle_periodic_broadcast;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Remove a CPU from broadcasting
*/
-void tick_shutdown_broadcast(unsigned int *cpup)
+void tick_shutdown_broadcast(unsigned int cpu)
{
struct clock_event_device *bc;
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
void tick_suspend_broadcast(void)
{
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
-int tick_resume_broadcast(void)
+/*
+ * This is called from tick_resume_local() on a resuming CPU. That's
+ * called from the core resume function, tick_unfreeze() and the magic XEN
+ * resume hackery.
+ *
+ * In none of these cases the broadcast device mode can change and the
+ * bit of the resuming CPU in the broadcast mask is safe as well.
+ */
+bool tick_resume_check_broadcast(void)
+{
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
+ return false;
+ else
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
+}
+
+void tick_resume_broadcast(void)
{
struct clock_event_device *bc;
unsigned long flags;
- int broadcast = 0;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
bc = tick_broadcast_device.evtdev;
if (bc) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+ clockevents_tick_resume(bc);
switch (tick_broadcast_device.mode) {
case TICKDEV_MODE_PERIODIC:
if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(bc);
- broadcast = cpumask_test_cpu(smp_processor_id(),
- tick_broadcast_mask);
break;
case TICKDEV_MODE_ONESHOT:
if (!cpumask_empty(tick_broadcast_mask))
- broadcast = tick_resume_broadcast_oneshot(bc);
+ tick_resume_broadcast_oneshot(bc);
break;
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-
- return broadcast;
}
-
#ifdef CONFIG_TICK_ONESHOT
static cpumask_var_t tick_broadcast_oneshot_mask;
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
{
int ret;
- if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ if (bc->state != CLOCK_EVT_STATE_ONESHOT)
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
ret = clockevents_program_event(bc, expires, force);
if (!ret)
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
return ret;
}
-int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
- return 0;
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
}
/*
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void)
* switched over, leave the device alone.
*/
if (td->mode == TICKDEV_MODE_ONESHOT) {
- clockevents_set_mode(td->evtdev,
- CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(td->evtdev,
+ CLOCK_EVT_STATE_ONESHOT);
}
}
}
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (dev->next_event.tv64 < bc->next_event.tv64)
return;
}
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-static void broadcast_move_bc(int deadcpu)
-{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
-
- if (!bc || !broadcast_needs_cpu(bc, deadcpu))
- return;
- /* This moves the broadcast assignment to this cpu */
- clockevents_program_event(bc, bc->next_event, 1);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state: The target state (enter/exit)
+ *
+ * The system enters/leaves a state where affected devices might stop.
* Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
*/
-int tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags;
- ktime_t now;
int cpu, ret = 0;
+ ktime_t now;
/*
* Periodic mode does not care about the enter/exit of power
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason)
* We are called with preemption disabled from the depth of the
* idle code, so we can't be moved away.
*/
- cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
+ td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
+ raw_spin_lock(&tick_broadcast_lock);
bc = tick_broadcast_device.evtdev;
+ cpu = smp_processor_id();
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
- if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+ if (state == TICK_BROADCAST_ENTER) {
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
broadcast_shutdown_local(bc, dev);
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
/*
* The cpu which was handling the broadcast
* timer marked this cpu in the broadcast
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason)
}
}
out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ raw_spin_unlock(&tick_broadcast_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
/*
* Reset the one shot broadcast for a cpu
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+ int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
bc->event_handler = tick_handle_oneshot_broadcast;
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_oneshot_mask, tmpmask);
if (was_periodic && !cpumask_empty(tmpmask)) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_init_next_event(tmpmask,
tick_next_period);
tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#ifdef CONFIG_HOTPLUG_CPU
+void hotplug_cpu__broadcast_tick_pull(int deadcpu)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this CPU: */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
/*
* Remove a dead CPU from broadcasting
*/
-void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+void tick_shutdown_broadcast_oneshot(unsigned int cpu)
{
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
- broadcast_move_bc(cpu);
-
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
/*
* Check, whether the broadcast device is in one shot mode
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f7c5155..3ae6afa 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
tick_periodic(cpu);
- if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+ if (dev->state != CLOCK_EVT_STATE_ONESHOT)
return;
for (;;) {
/*
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
unsigned long seq;
ktime_t next;
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
next = tick_next_period;
} while (read_seqretry(&jiffies_lock, seq));
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
@@ -332,14 +332,16 @@ out_bc:
tick_install_broadcast_device(newdev);
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled.
+ * Called with interrupts disabled. No locking required. If
+ * tick_do_timer_cpu is owned by this cpu, nothing can change it.
*/
-void tick_handover_do_timer(int *cpup)
+void tick_handover_do_timer(void)
{
- if (*cpup == tick_do_timer_cpu) {
+ if (tick_do_timer_cpu == smp_processor_id()) {
int cpu = cpumask_first(cpu_online_mask);
tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup)
* access the hardware device itself.
* We just set the mode and remove it from the lists.
*/
-void tick_shutdown(unsigned int *cpup)
+void tick_shutdown(unsigned int cpu)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+ struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup)
* Prevent that the clock events layer tries to call
* the set mode function!
*/
+ dev->state = CLOCK_EVT_STATE_DETACHED;
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
}
+#endif
-void tick_suspend(void)
+/**
+ * tick_suspend_local - Suspend the local tick device
+ *
+ * Called from the local CPU for freeze with interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend_local(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
clockevents_shutdown(td->evtdev);
}
-void tick_resume(void)
+/**
+ * tick_resume_local - Resume the local tick device
+ *
+ * Called from the local CPU for unfreeze or XEN resume magic.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume_local(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
- int broadcast = tick_resume_broadcast();
-
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
+ bool broadcast = tick_resume_check_broadcast();
+ clockevents_tick_resume(td->evtdev);
if (!broadcast) {
if (td->mode == TICKDEV_MODE_PERIODIC)
tick_setup_periodic(td->evtdev, 0);
@@ -394,6 +411,35 @@ void tick_resume(void)
}
}
+/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend() with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend(void)
+{
+ tick_suspend_local();
+ tick_suspend_broadcast();
+}
+
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume() with only one
+ * CPU online and interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume(void)
+{
+ tick_resume_broadcast();
+ tick_resume_local();
+}
+
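
tick_suspend() and tick_resume() are now thin wrappers that compose the local and broadcast variants; note that resume brings the broadcast device back before the local one, the reverse of the suspend order. A minimal sketch of that composition, with stubs in place of the real helpers:

/* Sketch of the suspend/resume composition; the prints stand in for the
 * real tick_*_local() and tick_*_broadcast() helpers. */
#include <stdio.h>

static void tick_suspend_local(void)     { puts("local tick: shutdown"); }
static void tick_suspend_broadcast(void) { puts("broadcast: shutdown"); }
static void tick_resume_broadcast(void)  { puts("broadcast: resume"); }
static void tick_resume_local(void)      { puts("local tick: resume"); }

static void tick_suspend(void)
{
	tick_suspend_local();      /* stop the per-CPU device first */
	tick_suspend_broadcast();  /* then the system-wide broadcast device */
}

static void tick_resume(void)
{
	tick_resume_broadcast();   /* bring the broadcast device back first */
	tick_resume_local();       /* so the local device can rely on it */
}

int main(void)
{
	tick_suspend();
	tick_resume();
	return 0;
}
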
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
static unsigned int tick_freeze_depth;
@@ -411,12 +457,10 @@ void tick_freeze(void)
raw_spin_lock(&tick_freeze_lock);
tick_freeze_depth++;
- if (tick_freeze_depth == num_online_cpus()) {
+ if (tick_freeze_depth == num_online_cpus())
timekeeping_suspend();
- } else {
- tick_suspend();
- tick_suspend_broadcast();
- }
+ else
+ tick_suspend_local();
raw_spin_unlock(&tick_freeze_lock);
}
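
tick_freeze() counts how many CPUs have frozen their tick: only the last CPU to enter suspends timekeeping as a whole, every earlier CPU merely shuts down its local device, and tick_unfreeze() mirrors this on the way out. A single-threaded model of the depth counter (the real code holds tick_freeze_lock and runs once per CPU with interrupts off):

/* Single-threaded model of the tick_freeze_depth logic. */
#include <stdio.h>

#define NUM_ONLINE_CPUS 4

static unsigned int tick_freeze_depth;

static void tick_freeze(int cpu)
{
	tick_freeze_depth++;
	if (tick_freeze_depth == NUM_ONLINE_CPUS)
		printf("cpu%d: last one in -> timekeeping_suspend()\n", cpu);
	else
		printf("cpu%d: tick_suspend_local()\n", cpu);
}

static void tick_unfreeze(int cpu)
{
	if (tick_freeze_depth == NUM_ONLINE_CPUS)
		printf("cpu%d: first one out -> timekeeping_resume()\n", cpu);
	else
		printf("cpu%d: tick_resume_local()\n", cpu);
	tick_freeze_depth--;
}

int main(void)
{
	for (int cpu = 0; cpu < NUM_ONLINE_CPUS; cpu++)
		tick_freeze(cpu);
	for (int cpu = NUM_ONLINE_CPUS - 1; cpu >= 0; cpu--)
		tick_unfreeze(cpu);
	return 0;
}
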
@@ -437,7 +481,7 @@ void tick_unfreeze(void)
if (tick_freeze_depth == num_online_cpus())
timekeeping_resume();
else
- tick_resume();
+ tick_resume_local();
tick_freeze_depth--;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 366aeb4..b64fdd8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,15 +5,12 @@
#include <linux/tick.h>
#include "timekeeping.h"
+#include "tick-sched.h"
-extern seqlock_t jiffies_lock;
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
-#define CS_NAME_LEN 32
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
-
-#define TICK_DO_TIMER_NONE -1
-#define TICK_DO_TIMER_BOOT -2
+# define TICK_DO_TIMER_NONE -1
+# define TICK_DO_TIMER_BOOT -2
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern ktime_t tick_next_period;
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_handover_do_timer(int *cpup);
-extern void tick_shutdown(unsigned int *cpup);
+extern void tick_shutdown(unsigned int cpu);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
struct clock_event_device *newdev);
extern void tick_install_replacement(struct clock_event_device *dev);
+extern int tick_is_oneshot_available(void);
+extern struct tick_device *tick_get_device(int cpu);
-extern void clockevents_shutdown(struct clock_event_device *dev);
+extern int clockevents_tick_resume(struct clock_event_device *dev);
+/* Check, if the device is functional or a dummy for broadcast */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
+extern void clockevents_shutdown(struct clock_event_device *dev);
+extern void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new);
+extern void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state);
+extern int clockevents_program_event(struct clock_event_device *dev,
+ ktime_t expires, bool force);
+extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
-/*
- * NO_HZ / high resolution timer shared code
- */
+/* Broadcasting support */
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_suspend_broadcast(void);
+extern void tick_resume_broadcast(void);
+extern bool tick_resume_check_broadcast(void);
+extern void tick_broadcast_init(void);
+extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
+extern struct tick_device *tick_get_broadcast_device(void);
+extern struct cpumask *tick_get_broadcast_mask(void);
+# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
+static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline void tick_resume_broadcast(void) { }
+static inline bool tick_resume_check_broadcast(void) { return false; }
+static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
+
+/* Set the periodic handler in non broadcast mode */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
+
+#else /* !GENERIC_CLOCKEVENTS: */
+static inline void tick_suspend(void) { }
+static inline void tick_resume(void) { }
+#endif /* !GENERIC_CLOCKEVENTS */
+
+/* Oneshot related functions */
#ifdef CONFIG_TICK_ONESHOT
extern void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force);
extern void tick_oneshot_notify(void);
extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
extern void tick_resume_oneshot(void);
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static inline bool tick_oneshot_possible(void) { return true; }
+extern int tick_oneshot_mode_active(void);
+extern void tick_clock_notify(void);
+extern int tick_check_oneshot_change(int allow_nohz);
+extern int tick_init_highres(void);
+#else /* !CONFIG_TICK_ONESHOT: */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt) { BUG(); }
+static inline void tick_resume_oneshot(void) { BUG(); }
+static inline int tick_program_event(ktime_t expires, int force) { return 0; }
+static inline void tick_oneshot_notify(void) { }
+static inline bool tick_oneshot_possible(void) { return false; }
+static inline int tick_oneshot_mode_active(void) { return 0; }
+static inline void tick_clock_notify(void) { }
+static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+#endif /* !CONFIG_TICK_ONESHOT */
+
+/* Functions related to oneshot broadcasting */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern int tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
-extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
-extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
-# else /* BROADCAST */
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
+extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+#else /* !(BROADCAST && ONESHOT): */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_switch_to_oneshot(void) { }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
-static inline bool tick_broadcast_oneshot_available(void) { return true; }
-# endif /* !BROADCAST */
-
-#else /* !ONESHOT */
-static inline
-void tick_setup_oneshot(struct clock_event_device *newdev,
- void (*handler)(struct clock_event_device *),
- ktime_t nextevt)
-{
- BUG();
-}
-static inline void tick_resume_oneshot(void)
-{
- BUG();
-}
-static inline int tick_program_event(ktime_t expires, int force)
-{
- return 0;
-}
-static inline void tick_oneshot_notify(void) { }
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
-static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
-{
- return 0;
-}
-static inline int tick_broadcast_oneshot_active(void) { return 0; }
-static inline bool tick_broadcast_oneshot_available(void) { return false; }
-#endif /* !TICK_ONESHOT */
+static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
+#endif /* !(BROADCAST && ONESHOT) */
/* NO_HZ_FULL internal */
#ifdef CONFIG_NO_HZ_FULL
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void);
# else
static inline void tick_nohz_init(void) { }
#endif
-
-/*
- * Broadcasting support
- */
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern void tick_install_broadcast_device(struct clock_event_device *dev);
-extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
-extern void tick_shutdown_broadcast(unsigned int *cpup);
-extern void tick_suspend_broadcast(void);
-extern int tick_resume_broadcast(void);
-extern void tick_broadcast_init(void);
-extern void
-tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
-int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
-
-#else /* !BROADCAST */
-
-static inline void tick_install_broadcast_device(struct clock_event_device *dev)
-{
-}
-
-static inline int tick_is_broadcast_device(struct clock_event_device *dev)
-{
- return 0;
-}
-static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
- int cpu)
-{
- return 0;
-}
-static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
-static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
-static inline void tick_suspend_broadcast(void) { }
-static inline int tick_resume_broadcast(void) { return 0; }
-static inline void tick_broadcast_init(void) { }
-static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
- u32 freq) { return -ENODEV; }
-
-/*
- * Set the periodic handler in non broadcast mode
- */
-static inline void tick_set_periodic_handler(struct clock_event_device *dev,
- int broadcast)
-{
- dev->event_handler = tick_handle_periodic;
-}
-#endif /* !BROADCAST */
-
-/*
- * Check, if the device is functional or a dummy for broadcast
- */
-static inline int tick_device_is_functional(struct clock_event_device *dev)
-{
- return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
-}
-
-int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-
-#endif
-
-extern void do_timer(unsigned long ticks);
-extern void update_wall_time(void);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 7ce740e..67a64b1 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(dev, ktime_get(), true);
}
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
ktime_t next_event)
{
newdev->event_handler = handler;
- clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(newdev, next_event, true);
}
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
td->mode = TICKDEV_MODE_ONESHOT;
dev->event_handler = handler;
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_switch_to_oneshot();
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a4c4eda..9142591 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -34,7 +34,7 @@
/*
* Per cpu nohz control structure
*/
-DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
/*
* The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
+int tick_nohz_tick_stopped(void)
+{
+ return __this_cpu_read(tick_cpu_sched.tick_stopped);
+}
+
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644
index 0000000..28b5da3
--- /dev/null
+++ b/kernel/time/tick-sched.h
@@ -0,0 +1,74 @@
+#ifndef _TICK_SCHED_H
+#define _TICK_SCHED_H
+
+#include <linux/hrtimer.h>
+
+enum tick_device_mode {
+ TICKDEV_MODE_PERIODIC,
+ TICKDEV_MODE_ONESHOT,
+};
+
+struct tick_device {
+ struct clock_event_device *evtdev;
+ enum tick_device_mode mode;
+};
+
+enum tick_nohz_mode {
+ NOHZ_MODE_INACTIVE,
+ NOHZ_MODE_LOWRES,
+ NOHZ_MODE_HIGHRES,
+};
+
+/**
+ * struct tick_sched - sched tick emulation and no idle tick control/stats
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
+ * @last_tick: Store the last tick expiry time when the tick
+ * timer is modified for nohz sleeps. This is necessary
+ * to resume the tick timer operation in the timeline
+ * when the CPU returns from nohz sleep.
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_calls: Total number of idle calls
+ * @idle_sleeps: Number of idle calls, where the sched tick was stopped
+ * @idle_entrytime: Time when the idle call was entered
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_exittime: Time when the idle state was left
+ * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
+ * @sleep_length: Duration of the current idle sleep
+ * @do_timer_last:	CPU was the last one doing do_timer before going idle
+ */
+struct tick_sched {
+ struct hrtimer sched_timer;
+ unsigned long check_clocks;
+ enum tick_nohz_mode nohz_mode;
+ ktime_t last_tick;
+ int inidle;
+ int tick_stopped;
+ unsigned long idle_jiffies;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+ int idle_active;
+ ktime_t idle_entrytime;
+ ktime_t idle_waketime;
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+ ktime_t sleep_length;
+ unsigned long last_jiffies;
+ unsigned long next_jiffies;
+ ktime_t idle_expires;
+ int do_timer_last;
+};
+
+extern struct tick_sched *tick_get_tick_sched(int cpu);
+
+extern void tick_setup_sched_timer(void);
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
+extern void tick_cancel_sched_timer(int cpu);
+#else
+static inline void tick_cancel_sched_timer(int cpu) { }
+#endif
+
+#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3fcff0..946acb7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -64,9 +64,6 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-/* Flag for if there is a persistent clock on this platform */
-bool __read_mostly persistent_clock_exist = false;
-
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -1173,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts)
ts->tv_nsec = 0;
}
+void __weak read_persistent_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_persistent_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
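
The __weak read_persistent_clock64() simply widens the result of the legacy 32-bit interface, so architectures can later provide a native 64-bit implementation without touching the callers. A sketch of that wrapping pattern with simplified stand-in types:

/* Sketch of wrapping a legacy 32-bit-time interface behind a 64-bit one.
 * The types and names are simplified stand-ins for the kernel's timespec,
 * timespec64 and read_persistent_clock(). */
#include <stdio.h>
#include <stdint.h>

struct timespec32 { int32_t tv_sec; long tv_nsec; };
struct timespec64 { int64_t tv_sec; long tv_nsec; };

/* legacy provider: only 32-bit seconds */
static void read_persistent_clock(struct timespec32 *ts)
{
	ts->tv_sec = 1420070400;  /* pretend RTC value */
	ts->tv_nsec = 0;
}

/* default 64-bit front end; an architecture could supply its own */
static void read_persistent_clock64(struct timespec64 *ts64)
{
	struct timespec32 ts;

	read_persistent_clock(&ts);
	ts64->tv_sec = ts.tv_sec;     /* widen, no truncation possible */
	ts64->tv_nsec = ts.tv_nsec;
}

int main(void)
{
	struct timespec64 now;

	read_persistent_clock64(&now);
	printf("persistent clock: %lld.%09ld\n",
	       (long long)now.tv_sec, now.tv_nsec);
	return 0;
}
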
/**
* read_boot_clock - Return time of the system start.
*
@@ -1188,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts)
ts->tv_nsec = 0;
}
+void __weak read_boot_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_boot_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
+/* Flag for if timekeeping_resume() has injected sleeptime */
+static bool sleeptime_injected;
+
+/* Flag for if there is a persistent clock on this platform */
+static bool persistent_clock_exists;
+
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*/
@@ -1197,20 +1216,17 @@ void __init timekeeping_init(void)
struct clocksource *clock;
unsigned long flags;
struct timespec64 now, boot, tmp;
- struct timespec ts;
- read_persistent_clock(&ts);
- now = timespec_to_timespec64(ts);
+ read_persistent_clock64(&now);
if (!timespec64_valid_strict(&now)) {
pr_warn("WARNING: Persistent clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
now.tv_sec = 0;
now.tv_nsec = 0;
} else if (now.tv_sec || now.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
- read_boot_clock(&ts);
- boot = timespec_to_timespec64(ts);
+ read_boot_clock64(&boot);
if (!timespec64_valid_strict(&boot)) {
pr_warn("WARNING: Boot clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
@@ -1242,7 +1258,7 @@ void __init timekeeping_init(void)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
-/* time in seconds when suspend began */
+/* time in seconds when suspend began for persistent clock */
static struct timespec64 timekeeping_suspend_time;
/**
@@ -1267,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
tk_debug_account_sleep_time(delta);
}
+#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
+/**
+ * We have three kinds of time sources to use for sleep time
+ * injection, the preference order is:
+ * 1) non-stop clocksource
+ * 2) persistent clock (ie: RTC accessible when irqs are off)
+ * 3) RTC
+ *
+ * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
+ * If the system has neither 1) nor 2), 3) is used as the fallback.
+ *
+ * If timekeeping has injected sleeptime via either 1) or 2),
+ * 3) becomes needless, so in this case we don't need to call
+ * rtc_resume(), and this is what timekeeping_rtc_skipresume()
+ * means.
+ */
+bool timekeeping_rtc_skipresume(void)
+{
+ return sleeptime_injected;
+}
+
+/**
+ * Whether 1) can be used is only known during timekeeping_resume(),
+ * which is invoked after rtc_suspend(), so we cannot reliably skip
+ * rtc_suspend() just because the system has 1).
+ *
+ * But if the system has 2), 2) will definitely be used, so in that
+ * case we don't need to call rtc_suspend(), and this is what
+ * timekeeping_rtc_skipsuspend() means.
+ */
+bool timekeeping_rtc_skipsuspend(void)
+{
+ return persistent_clock_exists;
+}
+
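
Both new predicates boil down to a boolean each: rtc_resume() is redundant once timekeeping has already injected sleep time from source 1) or 2), and rtc_suspend() can only be skipped when the persistent clock is known to exist. A small decision-table sketch, with flags mirroring sleeptime_injected and persistent_clock_exists:

/* Decision-table sketch for the two predicates; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

static bool sleeptime_injected;      /* set by timekeeping_resume() */
static bool persistent_clock_exists; /* set at init or first suspend */

static bool timekeeping_rtc_skipresume(void)
{
	return sleeptime_injected;       /* source 1) or 2) already used */
}

static bool timekeeping_rtc_skipsuspend(void)
{
	return persistent_clock_exists;  /* 2) will definitely be used */
}

int main(void)
{
	persistent_clock_exists = true;
	sleeptime_injected = true;       /* as if resume found a nonstop clock */

	printf("skip rtc_suspend(): %d\n", timekeeping_rtc_skipsuspend());
	printf("skip rtc_resume():  %d\n", timekeeping_rtc_skipresume());
	return 0;
}
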
/**
 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
* @delta: pointer to a timespec64 delta value
*
- * This hook is for architectures that cannot support read_persistent_clock
+ * This hook is for architectures that cannot support read_persistent_clock64
* because their RTC/persistent clock is only accessible when irqs are enabled.
+ * and also don't have an effective nonstop clocksource.
*
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
@@ -1282,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- /*
- * Make sure we don't set the clock twice, as timekeeping_resume()
- * already did it
- */
- if (has_persistent_clock())
- return;
-
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
@@ -1304,13 +1350,10 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
/* signal hrtimers about time change */
clock_was_set();
}
+#endif
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
*/
void timekeeping_resume(void)
{
@@ -1318,12 +1361,10 @@ void timekeeping_resume(void)
struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
- struct timespec tmp;
cycle_t cycle_now, cycle_delta;
- bool suspendtime_found = false;
- read_persistent_clock(&tmp);
- ts_new = timespec_to_timespec64(tmp);
+ sleeptime_injected = false;
+ read_persistent_clock64(&ts_new);
clockevents_resume();
clocksource_resume();
@@ -1368,13 +1409,13 @@ void timekeeping_resume(void)
nsec += ((u64) cycle_delta * mult) >> shift;
ts_delta = ns_to_timespec64(nsec);
- suspendtime_found = true;
+ sleeptime_injected = true;
} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
- suspendtime_found = true;
+ sleeptime_injected = true;
}
- if (suspendtime_found)
+ if (sleeptime_injected)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
@@ -1389,9 +1430,7 @@ void timekeeping_resume(void)
touch_softlockup_watchdog();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
- /* Resume hrtimers */
+ tick_resume();
hrtimers_resume();
}
@@ -1401,10 +1440,8 @@ int timekeeping_suspend(void)
unsigned long flags;
struct timespec64 delta, delta_delta;
static struct timespec64 old_delta;
- struct timespec tmp;
- read_persistent_clock(&tmp);
- timekeeping_suspend_time = timespec_to_timespec64(tmp);
+ read_persistent_clock64(&timekeeping_suspend_time);
/*
* On some systems the persistent_clock can not be detected at
@@ -1412,31 +1449,33 @@ int timekeeping_suspend(void)
* value returned, update the persistent_clock_exists flag.
*/
if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
timekeeping_suspended = 1;
- /*
- * To avoid drift caused by repeated suspend/resumes,
- * which each can add ~1 second drift error,
- * try to compensate so the difference in system time
- * and persistent_clock time stays close to constant.
- */
- delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
- delta_delta = timespec64_sub(delta, old_delta);
- if (abs(delta_delta.tv_sec) >= 2) {
+ if (persistent_clock_exists) {
/*
- * if delta_delta is too large, assume time correction
- * has occured and set old_delta to the current delta.
+ * To avoid drift caused by repeated suspend/resumes,
+ * which each can add ~1 second drift error,
+ * try to compensate so the difference in system time
+ * and persistent_clock time stays close to constant.
*/
- old_delta = delta;
- } else {
- /* Otherwise try to adjust old_system to compensate */
- timekeeping_suspend_time =
- timespec64_add(timekeeping_suspend_time, delta_delta);
+ delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta_delta = timespec64_sub(delta, old_delta);
+ if (abs(delta_delta.tv_sec) >= 2) {
+ /*
+ * if delta_delta is too large, assume time correction
+ * has occurred and set old_delta to the current delta.
+ */
+ old_delta = delta;
+ } else {
+ /* Otherwise try to adjust old_system to compensate */
+ timekeeping_suspend_time =
+ timespec64_add(timekeeping_suspend_time, delta_delta);
+ }
}
timekeeping_update(tk, TK_MIRROR);
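
The drift compensation now runs only when a persistent clock exists: it keeps the gap between system time and the persistent clock roughly constant across suspend cycles, and treats a jump of two seconds or more as a genuine time correction rather than drift. An arithmetic sketch with plain second counts in place of the timespec64 values:

/* Arithmetic sketch of the suspend-time drift compensation. */
#include <stdio.h>
#include <stdlib.h>

static long old_delta;

/* xtime_sec: current system time, persist_sec: persistent clock reading */
static long compensate(long xtime_sec, long persist_sec)
{
	long delta = xtime_sec - persist_sec;
	long delta_delta = delta - old_delta;

	if (labs(delta_delta) >= 2) {
		/* assume a real time correction happened, start over */
		old_delta = delta;
	} else {
		/* fold the sub-2s drift back into the suspend timestamp */
		persist_sec += delta_delta;
	}
	return persist_sec;
}

int main(void)
{
	printf("%ld\n", compensate(1000, 990));  /* first cycle: baseline */
	printf("%ld\n", compensate(2001, 1990)); /* 1s drift gets folded in */
	printf("%ld\n", compensate(9000, 3000)); /* big jump: a correction */
	return 0;
}
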
@@ -1444,7 +1483,7 @@ int timekeeping_suspend(void)
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+ tick_suspend();
clocksource_suspend();
clockevents_suspend();
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 1d91416..ead8794 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
+extern void do_timer(unsigned long ticks);
+extern void update_wall_time(void);
+
+extern seqlock_t jiffies_lock;
+
+#define CS_NAME_LEN 32
+
#endif
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c5..2ece3aa 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -90,8 +90,18 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
+/*
+ * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
+ * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
+ * pointer to per-cpu entries because we don't know where we'll map the section,
+ * even for the boot cpu.
+ *
+ * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
+ * rest of them.
+ */
struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
+
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/* Functions below help us manage 'deferrable' flag */
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
+
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-static int init_timers_cpu(int cpu)
-{
- int j;
- struct tvec_base *base;
- static char tvec_base_done[NR_CPUS];
-
- if (!tvec_base_done[cpu]) {
- static char boot_done;
-
- if (boot_done) {
- /*
- * The APs use this path later in boot
- */
- base = kzalloc_node(sizeof(*base), GFP_KERNEL,
- cpu_to_node(cpu));
- if (!base)
- return -ENOMEM;
-
- /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
- if (WARN_ON(base != tbase_get_base(base))) {
- kfree(base);
- return -ENOMEM;
- }
- per_cpu(tvec_bases, cpu) = base;
- } else {
- /*
- * This is for the boot CPU - we use compile-time
- * static initialisation because per-cpu memory isn't
- * ready yet and because the memory allocators are not
- * initialised either.
- */
- boot_done = 1;
- base = &boot_tvec_bases;
- }
- spin_lock_init(&base->lock);
- tvec_base_done[cpu] = 1;
- base->cpu = cpu;
- } else {
- base = per_cpu(tvec_bases, cpu);
- }
-
-
- for (j = 0; j < TVN_SIZE; j++) {
- INIT_LIST_HEAD(base->tv5.vec + j);
- INIT_LIST_HEAD(base->tv4.vec + j);
- INIT_LIST_HEAD(base->tv3.vec + j);
- INIT_LIST_HEAD(base->tv2.vec + j);
- }
- for (j = 0; j < TVR_SIZE; j++)
- INIT_LIST_HEAD(base->tv1.vec + j);
-
- base->timer_jiffies = jiffies;
- base->next_timer = base->timer_jiffies;
- base->active_timers = 0;
- base->all_timers = 0;
- return 0;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
{
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu)
migrate_timer_list(new_base, old_base->tv5.vec + i);
}
+ old_base->active_timers = 0;
+ old_base->all_timers = 0;
+
spin_unlock(&old_base->lock);
spin_unlock_irq(&new_base->lock);
put_cpu_var(tvec_bases);
}
-#endif /* CONFIG_HOTPLUG_CPU */
static int timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- long cpu = (long)hcpu;
- int err;
-
- switch(action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- err = init_timers_cpu(cpu);
- if (err < 0)
- return notifier_from_errno(err);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
+ switch (action) {
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- migrate_timers(cpu);
+ migrate_timers((long)hcpu);
break;
-#endif
default:
break;
}
+
return NOTIFY_OK;
}
-static struct notifier_block timers_nb = {
- .notifier_call = timer_cpu_notify,
-};
+static inline void timer_register_cpu_notifier(void)
+{
+ cpu_notifier(timer_cpu_notify, 0);
+}
+#else
+static inline void timer_register_cpu_notifier(void) { }
+#endif /* CONFIG_HOTPLUG_CPU */
+static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+{
+ int j;
-void __init init_timers(void)
+ BUG_ON(base != tbase_get_base(base));
+
+ base->cpu = cpu;
+ per_cpu(tvec_bases, cpu) = base;
+ spin_lock_init(&base->lock);
+
+ for (j = 0; j < TVN_SIZE; j++) {
+ INIT_LIST_HEAD(base->tv5.vec + j);
+ INIT_LIST_HEAD(base->tv4.vec + j);
+ INIT_LIST_HEAD(base->tv3.vec + j);
+ INIT_LIST_HEAD(base->tv2.vec + j);
+ }
+ for (j = 0; j < TVR_SIZE; j++)
+ INIT_LIST_HEAD(base->tv1.vec + j);
+
+ base->timer_jiffies = jiffies;
+ base->next_timer = base->timer_jiffies;
+}
+
+static void __init init_timer_cpus(void)
{
- int err;
+ struct tvec_base *base;
+ int local_cpu = smp_processor_id();
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ if (cpu == local_cpu)
+ base = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+ else
+ base = per_cpu_ptr(&__tvec_bases, cpu);
+#endif
+
+ init_timer_cpu(base, cpu);
+ }
+}
+
+void __init init_timers(void)
+{
/* ensure there are enough low bits for flags in timer->base pointer */
BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
- err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- BUG_ON(err != NOTIFY_OK);
-
+ init_timer_cpus();
init_timer_stats();
- register_cpu_notifier(&timers_nb);
+ timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
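
init_timers() now sets up every possible CPU's timer base in one pass at boot instead of doing it lazily from CPU notifier callbacks: the boot CPU keeps the statically allocated boot_tvec_bases (per-cpu memory is not usable that early), and every other CPU gets its per-cpu __tvec_bases slot. A sketch of that selection, with plain arrays modelling per-cpu storage:

/* Sketch of the init_timer_cpus() selection: a static base for the boot
 * CPU, per-CPU storage for the rest. Arrays model per-CPU variables. */
#include <stdio.h>

#define NR_CPUS 4

struct tvec_base { int cpu; unsigned long timer_jiffies; };

static struct tvec_base boot_tvec_bases;        /* compile-time storage */
static struct tvec_base __tvec_bases[NR_CPUS];  /* "per-CPU" storage */
static struct tvec_base *tvec_bases[NR_CPUS];   /* per-CPU pointer */

static void init_timer_cpu(struct tvec_base *base, int cpu, unsigned long jiffies)
{
	base->cpu = cpu;
	base->timer_jiffies = jiffies;
	tvec_bases[cpu] = base;
}

static void init_timer_cpus(int boot_cpu, unsigned long jiffies)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		struct tvec_base *base = (cpu == boot_cpu) ?
			&boot_tvec_bases : &__tvec_bases[cpu];
		init_timer_cpu(base, cpu, jiffies);
	}
}

int main(void)
{
	init_timer_cpus(0, 4294937296UL);
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d base=%p static=%d\n", cpu, (void *)tvec_bases[cpu],
		       tvec_bases[cpu] == &boot_tvec_bases);
	return 0;
}
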
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 2cfd194..e878c2e 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,10 +16,10 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-#include <linux/tick.h>
#include <asm/uaccess.h>
+#include "tick-internal.h"
struct timer_list_iter {
int cpu;
@@ -233,27 +233,27 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->set_mode);
SEQ_printf(m, "\n");
} else {
- if (dev->set_mode_shutdown) {
+ if (dev->set_state_shutdown) {
SEQ_printf(m, " shutdown: ");
- print_name_offset(m, dev->set_mode_shutdown);
+ print_name_offset(m, dev->set_state_shutdown);
SEQ_printf(m, "\n");
}
- if (dev->set_mode_periodic) {
+ if (dev->set_state_periodic) {
SEQ_printf(m, " periodic: ");
- print_name_offset(m, dev->set_mode_periodic);
+ print_name_offset(m, dev->set_state_periodic);
SEQ_printf(m, "\n");
}
- if (dev->set_mode_oneshot) {
+ if (dev->set_state_oneshot) {
SEQ_printf(m, " oneshot: ");
- print_name_offset(m, dev->set_mode_oneshot);
+ print_name_offset(m, dev->set_state_oneshot);
SEQ_printf(m, "\n");
}
- if (dev->set_mode_resume) {
+ if (dev->tick_resume) {
SEQ_printf(m, " resume: ");
- print_name_offset(m, dev->set_mode_resume);
+ print_name_offset(m, dev->tick_resume);
SEQ_printf(m, "\n");
}
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index c8e53c0..3b9a48a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -607,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config TRACE_ENUM_MAP_FILE
+ bool "Show enum mappings for trace events"
+ depends on TRACING
+ help
+ The "print fmt" of the trace events will show the enum names instead
+ of their values. This can cause problems for user space tools that
+ use this string to parse the raw data as user space does not know
+ how to convert the string to its value.
+
+ To fix this, there's a special macro in the kernel that can be used
+ to convert the enum into its value. If this macro is used, then the
+ print fmt strings will have the enums converted to their values.
+
+ If something does not get converted properly, this option can be
+ used to show what enums the kernel tried to convert.
+
+ This option is for debugging the enum conversions. A file is created
+ in the tracing directory called "enum_map" that will show the enum
+ names matched with their values and what trace event system they
+	  belong to.
+
+ Normally, the mapping of the strings to values will be freed after
+ boot up or module load. With this option, they will not be freed, as
+ they are needed for the "enum_map" file. Enabling this option will
+ increase the memory footprint of the running kernel.
+
+ If unsure, say N
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f22802..02bece4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,7 +18,7 @@
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/suspend.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
#include <linux/uaccess.h>
@@ -249,6 +249,19 @@ static void update_function_graph_func(void);
static inline void update_function_graph_func(void) { }
#endif
+
+static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
+{
+ /*
+ * If this is a dynamic ops or we force list func,
+ * then it needs to call the list anyway.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ return ftrace_ops_list_func;
+
+ return ftrace_ops_get_func(ops);
+}
+
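
ftrace_ops_get_list_func() keeps the "dynamic ops must go through the list walker" rule at the point where the global function pointer is chosen, so ftrace_ops_get_func() itself no longer needs it. A sketch of the direct-versus-list dispatch decision, with the flag and names simplified:

/* Sketch of the "call directly vs. go through the list" decision for a
 * single registered handler; not the real ftrace structures. */
#include <stdio.h>

#define OPS_FL_DYNAMIC 0x1

typedef void (*trace_func_t)(unsigned long ip);

struct ops {
	unsigned int flags;
	trace_func_t func;
};

static void list_func(unsigned long ip)  { printf("list dispatch @%#lx\n", ip); }
static void my_handler(unsigned long ip) { printf("direct call @%#lx\n", ip); }

static trace_func_t get_list_func(struct ops *ops)
{
	/* dynamic ops must always be reached via the list walker */
	if (ops->flags & OPS_FL_DYNAMIC)
		return list_func;
	return ops->func;            /* single static handler: call directly */
}

int main(void)
{
	struct ops static_ops  = { 0, my_handler };
	struct ops dynamic_ops = { OPS_FL_DYNAMIC, my_handler };

	get_list_func(&static_ops)(0x1000);
	get_list_func(&dynamic_ops)(0x2000);
	return 0;
}
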
static void update_ftrace_function(void)
{
ftrace_func_t func;
@@ -270,7 +283,7 @@ static void update_ftrace_function(void)
* then have the mcount trampoline call the function directly.
*/
} else if (ftrace_ops_list->next == &ftrace_list_end) {
- func = ftrace_ops_get_func(ftrace_ops_list);
+ func = ftrace_ops_get_list_func(ftrace_ops_list);
} else {
/* Just use the default ftrace_ops */
@@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = {
.stat_show = function_stat_show
};
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
struct dentry *entry;
@@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
}
}
- entry = debugfs_create_file("function_profile_enabled", 0644,
+ entry = tracefs_create_file("function_profile_enabled", 0644,
d_tracer, NULL, &ftrace_profile_fops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'function_profile_enabled' entry\n");
}
#else /* CONFIG_FUNCTION_PROFILER */
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
}
#endif /* CONFIG_FUNCTION_PROFILER */
@@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
mutex_unlock(&ftrace_lock);
}
-static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
+static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
trace_create_file("available_filter_functions", 0444,
@@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void)
}
core_initcall(ftrace_nodyn_init);
-static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
+static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
static inline void ftrace_startup_all(int command) { }
/* Keep as macros so we do not need to define the commands */
@@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic ops or we force list func,
- * then it needs to call the list anyway.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
- return ftrace_ops_list_func;
-
- /*
* If the func handles its own recursion, call it directly.
* Otherwise call the recursion protected function that
* will call the ftrace ops function.
@@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_debugfs(void)
+static __init int ftrace_init_tracefs(void)
{
struct dentry *d_tracer;
@@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void)
if (IS_ERR(d_tracer))
return 0;
- ftrace_init_dyn_debugfs(d_tracer);
+ ftrace_init_dyn_tracefs(d_tracer);
trace_create_file("set_ftrace_pid", 0644, d_tracer,
NULL, &ftrace_pid_fops);
- ftrace_profile_debugfs(d_tracer);
+ ftrace_profile_tracefs(d_tracer);
return 0;
}
-fs_initcall(ftrace_init_debugfs);
+fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5040d44..0315d43 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context);
static __always_inline int trace_recursive_lock(void)
{
- unsigned int val = this_cpu_read(current_context);
+ unsigned int val = __this_cpu_read(current_context);
int bit;
if (in_interrupt()) {
@@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void)
return 1;
val |= (1 << bit);
- this_cpu_write(current_context, val);
+ __this_cpu_write(current_context, val);
return 0;
}
static __always_inline void trace_recursive_unlock(void)
{
- unsigned int val = this_cpu_read(current_context);
-
- val--;
- val &= this_cpu_read(current_context);
- this_cpu_write(current_context, val);
+ __this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
}
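
The recursion guard reserves one bit per context level in a per-CPU word; deeper contexts use lower bit numbers, so the new unlock, val &= val - 1, clears the lowest set bit, i.e. pops the innermost context that the matching lock set. A userspace sketch of the bit protocol:

/* Sketch of the ring-buffer recursion guard: one bit per context level,
 * deeper contexts get lower bit numbers, unlock pops the innermost
 * (lowest set) bit with val &= val - 1. */
#include <stdio.h>

enum { CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };  /* NMI is the deepest */

static unsigned int current_context;   /* stands in for the per-CPU word */

static int recursive_lock(int bit)
{
	if (current_context & (1U << bit))
		return 1;                   /* re-entry in the same context */
	current_context |= 1U << bit;
	return 0;
}

static void recursive_unlock(void)
{
	current_context &= current_context - 1;  /* clear lowest set bit */
}

int main(void)
{
	recursive_lock(CTX_NORMAL);              /* task context writes */
	recursive_lock(CTX_IRQ);                 /* interrupt nests on top */
	printf("nested lock refused: %d\n", recursive_lock(CTX_IRQ));
	recursive_unlock();                      /* pops CTX_IRQ */
	recursive_unlock();                      /* pops CTX_NORMAL */
	printf("context word: %#x\n", current_context);
	return 0;
}
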
#else
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62c6506..91eecaa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,6 +20,7 @@
#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
@@ -31,6 +32,7 @@
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
+#include <linux/mount.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/ctype.h>
@@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+/* Map of enums to their values, for "enum_map" file */
+struct trace_enum_map_head {
+ struct module *mod;
+ unsigned long length;
+};
+
+union trace_enum_map_item;
+
+struct trace_enum_map_tail {
+ /*
+ * "end" is first and points to NULL as it must be different
+ * than "mod" or "enum_string"
+ */
+ union trace_enum_map_item *next;
+ const char *end; /* points to NULL */
+};
+
+static DEFINE_MUTEX(trace_enum_mutex);
+
+/*
+ * The trace_enum_maps are saved in an array with two extra elements,
+ * one at the beginning, and one at the end. The beginning item contains
+ * the count of the saved maps (head.length), and the module they
+ * belong to if not built in (head.mod). The ending item contains a
+ * pointer to the next array of saved enum_map items.
+ */
+union trace_enum_map_item {
+ struct trace_enum_map map;
+ struct trace_enum_map_head head;
+ struct trace_enum_map_tail tail;
+};
+
+static union trace_enum_map_item *trace_enum_maps;
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
#define MAX_TRACER_SIZE 100
@@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
.write = tracing_saved_cmdlines_size_write,
};
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static union trace_enum_map_item *
+update_enum_map(union trace_enum_map_item *ptr)
+{
+ if (!ptr->map.enum_string) {
+ if (ptr->tail.next) {
+ ptr = ptr->tail.next;
+ /* Set ptr to the next real item (skip head) */
+ ptr++;
+ } else
+ return NULL;
+ }
+ return ptr;
+}
+
+static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ union trace_enum_map_item *ptr = v;
+
+ /*
+ * Paranoid! If ptr points to end, we don't want to increment past it.
+ * This really should never happen.
+ */
+ ptr = update_enum_map(ptr);
+ if (WARN_ON_ONCE(!ptr))
+ return NULL;
+
+ ptr++;
+
+ (*pos)++;
+
+ ptr = update_enum_map(ptr);
+
+ return ptr;
+}
+
+static void *enum_map_start(struct seq_file *m, loff_t *pos)
+{
+ union trace_enum_map_item *v;
+ loff_t l = 0;
+
+ mutex_lock(&trace_enum_mutex);
+
+ v = trace_enum_maps;
+ if (v)
+ v++;
+
+ while (v && l < *pos) {
+ v = enum_map_next(m, v, &l);
+ }
+
+ return v;
+}
+
+static void enum_map_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&trace_enum_mutex);
+}
+
+static int enum_map_show(struct seq_file *m, void *v)
+{
+ union trace_enum_map_item *ptr = v;
+
+ seq_printf(m, "%s %ld (%s)\n",
+ ptr->map.enum_string, ptr->map.enum_value,
+ ptr->map.system);
+
+ return 0;
+}
+
+static const struct seq_operations tracing_enum_map_seq_ops = {
+ .start = enum_map_start,
+ .next = enum_map_next,
+ .stop = enum_map_stop,
+ .show = enum_map_show,
+};
+
+static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_enum_map_seq_ops);
+}
+
+static const struct file_operations tracing_enum_map_fops = {
+ .open = tracing_enum_map_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static inline union trace_enum_map_item *
+trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+{
+ /* Return tail of array given the head */
+ return ptr + ptr->head.length + 1;
+}
+
+static void
+trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+ int len)
+{
+ struct trace_enum_map **stop;
+ struct trace_enum_map **map;
+ union trace_enum_map_item *map_array;
+ union trace_enum_map_item *ptr;
+
+ stop = start + len;
+
+ /*
+ * The trace_enum_maps contains the map plus a head and tail item,
+ * where the head holds the module and length of array, and the
+ * tail holds a pointer to the next list.
+ */
+ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
+ if (!map_array) {
+ pr_warning("Unable to allocate trace enum mapping\n");
+ return;
+ }
+
+ mutex_lock(&trace_enum_mutex);
+
+ if (!trace_enum_maps)
+ trace_enum_maps = map_array;
+ else {
+ ptr = trace_enum_maps;
+ for (;;) {
+ ptr = trace_enum_jmp_to_tail(ptr);
+ if (!ptr->tail.next)
+ break;
+ ptr = ptr->tail.next;
+
+ }
+ ptr->tail.next = map_array;
+ }
+ map_array->head.mod = mod;
+ map_array->head.length = len;
+ map_array++;
+
+ for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
+ map_array->map = **map;
+ map_array++;
+ }
+ memset(map_array, 0, sizeof(*map_array));
+
+ mutex_unlock(&trace_enum_mutex);
+}
+
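
Each saved block is laid out as a head item (module pointer plus count), the map entries themselves, and a tail item whose next pointer links to the following block; trace_enum_jmp_to_tail() skips a whole block by adding head.length + 1. A self-contained userspace sketch of building and walking that layout, with simplified names:

/* Userspace sketch of the head/maps/tail union array layout used for the
 * saved enum maps, including the jump-to-tail traversal. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

union map_item;

struct enum_map { const char *name; long value; };
struct map_head { void *mod; unsigned long length; };
struct map_tail { union map_item *next; const char *end; };

union map_item {
	struct enum_map map;
	struct map_head head;
	struct map_tail tail;
};

static union map_item *saved_maps;

static union map_item *jmp_to_tail(union map_item *ptr)
{
	return ptr + ptr->head.length + 1;   /* head + N maps -> tail */
}

static void insert_block(void *mod, const struct enum_map *start, int len)
{
	union map_item *block = malloc(sizeof(*block) * (len + 2));
	union map_item *ptr;

	block[0].head.mod = mod;
	block[0].head.length = len;
	for (int i = 0; i < len; i++)
		block[1 + i].map = start[i];
	memset(&block[len + 1], 0, sizeof(block[0]));  /* tail sentinel */

	if (!saved_maps) {
		saved_maps = block;
		return;
	}
	for (ptr = saved_maps; jmp_to_tail(ptr)->tail.next; )
		ptr = jmp_to_tail(ptr)->tail.next;
	jmp_to_tail(ptr)->tail.next = block;
}

int main(void)
{
	const struct enum_map core[] = { { "HI_SOFTIRQ", 0 }, { "TIMER_SOFTIRQ", 1 } };
	const struct enum_map mod[]  = { { "MY_ENUM", 42 } };   /* hypothetical */

	insert_block(NULL, core, 2);
	insert_block((void *)0x1, mod, 1);

	for (union map_item *blk = saved_maps; blk; blk = jmp_to_tail(blk)->tail.next)
		for (unsigned long i = 0; i < blk->head.length; i++)
			printf("%s = %ld\n", blk[1 + i].map.name, blk[1 + i].map.value);
	return 0;
}
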
+static void trace_create_enum_file(struct dentry *d_tracer)
+{
+ trace_create_file("enum_map", 0444, d_tracer,
+ NULL, &tracing_enum_map_fops);
+}
+
+#else /* CONFIG_TRACE_ENUM_MAP_FILE */
+static inline void trace_create_enum_file(struct dentry *d_tracer) { }
+static inline void trace_insert_enum_map_file(struct module *mod,
+ struct trace_enum_map **start, int len) { }
+#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+
+static void trace_insert_enum_map(struct module *mod,
+ struct trace_enum_map **start, int len)
+{
+ struct trace_enum_map **map;
+
+ if (len <= 0)
+ return;
+
+ map = start;
+
+ trace_event_enum_update(map, len);
+
+ trace_insert_enum_map_file(mod, start, len);
+}
+
static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
-static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+static void update_tracer_options(struct trace_array *tr, struct tracer *t)
{
static struct trace_option_dentry *topts;
+
+ /* Only enable if the directory has been created already. */
+ if (!tr->dir)
+ return;
+
+ /* Currently, only the top instance has options */
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return;
+
+ destroy_trace_option_files(topts);
+ topts = create_trace_option_files(tr, t);
+}
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+{
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
@@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
free_snapshot(tr);
}
#endif
- /* Currently, only the top instance has options */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
- destroy_trace_option_files(topts);
- topts = create_trace_option_files(tr, t);
- }
+ update_tracer_options(tr, t);
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
@@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+
+ /* Top directory uses NULL as the parent */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return NULL;
+
+ /* All sub buffers have a descriptor */
return tr->dir;
}
@@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
if (IS_ERR(d_tracer))
return NULL;
- tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
+ tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
WARN_ONCE(!tr->percpu_dir,
- "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
+ "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
return tr->percpu_dir;
}
@@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
}
static void
-tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
+tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
struct dentry *d_cpu;
@@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
return;
snprintf(cpu_dir, 30, "cpu%ld", cpu);
- d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+ d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
- pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+ pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
return;
}
@@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name,
{
struct dentry *ret;
- ret = debugfs_create_file(name, mode, parent, data, fops);
+ ret = tracefs_create_file(name, mode, parent, data, fops);
if (!ret)
- pr_warning("Could not create debugfs '%s' entry\n", name);
+ pr_warning("Could not create tracefs '%s' entry\n", name);
return ret;
}
@@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
if (IS_ERR(d_tracer))
return NULL;
- tr->options = debugfs_create_dir("options", d_tracer);
+ tr->options = tracefs_create_dir("options", d_tracer);
if (!tr->options) {
- pr_warning("Could not create debugfs directory 'options'\n");
+ pr_warning("Could not create tracefs directory 'options'\n");
return NULL;
}
@@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
return;
for (cnt = 0; topts[cnt].opt; cnt++)
- debugfs_remove(topts[cnt].entry);
+ tracefs_remove(topts[cnt].entry);
kfree(topts);
}
@@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = {
struct dentry *trace_instance_dir;
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
static int
allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
@@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr)
#endif
}
-static int new_instance_create(const char *name)
+static int instance_mkdir(const char *name)
{
struct trace_array *tr;
int ret;
@@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
- tr->dir = debugfs_create_dir(name, trace_instance_dir);
+ tr->dir = tracefs_create_dir(name, trace_instance_dir);
if (!tr->dir)
goto out_free_tr;
ret = event_trace_add_tracer(tr->dir, tr);
if (ret) {
- debugfs_remove_recursive(tr->dir);
+ tracefs_remove_recursive(tr->dir);
goto out_free_tr;
}
- init_tracer_debugfs(tr, tr->dir);
+ init_tracer_tracefs(tr, tr->dir);
list_add(&tr->list, &ftrace_trace_arrays);
@@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name)
}
-static int instance_delete(const char *name)
+static int instance_rmdir(const char *name)
{
struct trace_array *tr;
int found = 0;
@@ -6382,82 +6615,17 @@ static int instance_delete(const char *name)
return ret;
}
-static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the new_instance_create() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = new_instance_create(dentry->d_iname);
-
- mutex_lock(&inode->i_mutex);
-
- return ret;
-}
-
-static int instance_rmdir(struct inode *inode, struct dentry *dentry)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /* The caller did a dget() on dentry */
- mutex_unlock(&dentry->d_inode->i_mutex);
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the instance_delete() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = instance_delete(dentry->d_iname);
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock(&dentry->d_inode->i_mutex);
-
- return ret;
-}
-
-static const struct inode_operations instance_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = instance_mkdir,
- .rmdir = instance_rmdir,
-};
-
static __init void create_trace_instances(struct dentry *d_tracer)
{
- trace_instance_dir = debugfs_create_dir("instances", d_tracer);
+ trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
+ instance_mkdir,
+ instance_rmdir);
if (WARN_ON(!trace_instance_dir))
return;
-
- /* Hijack the dir inode operations, to allow mkdir */
- trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
}
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
int cpu;
@@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
#endif
for_each_tracing_cpu(cpu)
- tracing_init_debugfs_percpu(tr, cpu);
+ tracing_init_tracefs_percpu(tr, cpu);
}
+static struct vfsmount *trace_automount(void *ingore)
+{
+ struct vfsmount *mnt;
+ struct file_system_type *type;
+
+ /*
+ * To maintain backward compatibility for tools that mount
+ * debugfs to get to the tracing facility, tracefs is automatically
+ * mounted to the debugfs/tracing directory.
+ */
+ type = get_fs_type("tracefs");
+ if (!type)
+ return NULL;
+ mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return NULL;
+ mntget(mnt);
+
+ return mnt;
+}
+
/**
* tracing_init_dentry - initialize top level trace array
*
@@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void)
{
struct trace_array *tr = &global_trace;
+ /* The top level trace array uses NULL as parent */
if (tr->dir)
- return tr->dir;
+ return NULL;
if (WARN_ON(!debugfs_initialized()))
return ERR_PTR(-ENODEV);
- tr->dir = debugfs_create_dir("tracing", NULL);
-
+ /*
+ * As there may still be users that expect the tracing
+ * files to exist in debugfs/tracing, we must automount
+ * the tracefs file system there, so older tools still
+ * work with the newer kernel.
+ */
+ tr->dir = debugfs_create_automount("tracing", NULL,
+ trace_automount, NULL);
if (!tr->dir) {
pr_warn_once("Could not create debugfs directory 'tracing'\n");
return ERR_PTR(-ENOMEM);
}
- return tr->dir;
+ return NULL;
+}
+
+extern struct trace_enum_map *__start_ftrace_enum_maps[];
+extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+
+static void __init trace_enum_init(void)
+{
+ int len;
+
+ len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
+ trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+}
+
+#ifdef CONFIG_MODULES
+static void trace_module_add_enums(struct module *mod)
+{
+ if (!mod->num_trace_enums)
+ return;
+
+ /*
+ * Modules with bad taint do not have events created, do
+ * not bother with enums either.
+ */
+ if (trace_module_has_bad_taint(mod))
+ return;
+
+ trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
}
-static __init int tracer_init_debugfs(void)
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static void trace_module_remove_enums(struct module *mod)
+{
+ union trace_enum_map_item *map;
+ union trace_enum_map_item **last = &trace_enum_maps;
+
+ if (!mod->num_trace_enums)
+ return;
+
+ mutex_lock(&trace_enum_mutex);
+
+ map = trace_enum_maps;
+
+ while (map) {
+ if (map->head.mod == mod)
+ break;
+ map = trace_enum_jmp_to_tail(map);
+ last = &map->tail.next;
+ map = map->tail.next;
+ }
+ if (!map)
+ goto out;
+
+ *last = trace_enum_jmp_to_tail(map)->tail.next;
+ kfree(map);
+ out:
+ mutex_unlock(&trace_enum_mutex);
+}
+#else
+static inline void trace_module_remove_enums(struct module *mod) { }
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ trace_module_add_enums(mod);
+ break;
+ case MODULE_STATE_GOING:
+ trace_module_remove_enums(mod);
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block trace_module_nb = {
+ .notifier_call = trace_module_notify,
+ .priority = 0,
+};
+#endif /* CONFIG_MODULES */
+
+static __init int tracer_init_tracefs(void)
{
struct dentry *d_tracer;
@@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void)
if (IS_ERR(d_tracer))
return 0;
- init_tracer_debugfs(&global_trace, d_tracer);
+ init_tracer_tracefs(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
&global_trace, &tracing_thresh_fops);
@@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void)
trace_create_file("saved_cmdlines_size", 0644, d_tracer,
NULL, &tracing_saved_cmdlines_size_fops);
+ trace_enum_init();
+
+ trace_create_enum_file(d_tracer);
+
+#ifdef CONFIG_MODULES
+ register_module_notifier(&trace_module_nb);
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void)
create_trace_options_dir(&global_trace);
+ /* If the tracer was started via cmdline, create options for it here */
+ if (global_trace.current_trace != &nop_trace)
+ update_tracer_options(&global_trace, global_trace.current_trace);
+
return 0;
}
@@ -6888,7 +7179,7 @@ void __init trace_init(void)
tracepoint_printk = 0;
}
tracer_alloc_buffers();
- trace_event_init();
+ trace_event_init();
}
__init static int clear_boot_tracer(void)
@@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void)
return 0;
}
-fs_initcall(tracer_init_debugfs);
+fs_initcall(tracer_init_tracefs);
late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dd8205a..d261201 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,7 +334,7 @@ struct tracer_flags {
/**
- * struct tracer - a specific tracer and its callbacks to interact with debugfs
+ * struct tracer - a specific tracer and its callbacks to interact with tracefs
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
@@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { }
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
+void trace_event_enum_update(struct trace_enum_map **map, int len);
#else
static inline void __init trace_event_init(void) { }
+static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
#endif
extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e2d027a..ee7b94a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
__dynamic_array( u32, buf )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->fmt),
FILTER_OTHER
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry,
__dynamic_array( char, buf )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
FILTER_OTHER
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
__field( const char *, str )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->str),
FILTER_OTHER
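The specifier change above matters on architectures that use function descriptors (ia64, ppc64 ABIv1, parisc): %pf dereferences its argument as a descriptor, while the ip recorded in these entries is already a plain text address, which is exactly what %ps resolves. A minimal sketch of the distinction (example_print_caller() is a made-up helper; _RET_IP_ and the format specifiers are the real interfaces):

	#include <linux/kernel.h>
	#include <linux/printk.h>

	static void example_print_caller(void)
	{
		unsigned long ip = _RET_IP_;		/* raw text address of the caller */

		pr_info("caller: %ps\n", (void *)ip);	/* correct: symbol for an address */
		pr_info("caller: %pf\n", (void *)ip);	/* wrong on descriptor archs: %pf
							 * would treat ip as a function
							 * descriptor and dereference it */
	}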
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index db54dda..7da1dfe 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -13,7 +13,7 @@
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
@@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- debugfs_remove_recursive(dir->entry);
+ tracefs_remove_recursive(dir->entry);
list_del(&dir->list);
__put_system_dir(dir);
}
@@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
}
spin_unlock(&dir->d_lock);
- debugfs_remove_recursive(dir);
+ tracefs_remove_recursive(dir);
}
list_del(&file->list);
@@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->entry = debugfs_create_dir(name, parent);
+ dir->entry = tracefs_create_dir(name, parent);
if (!dir->entry) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
@@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->subsystem = system;
file->system = dir;
- entry = debugfs_create_file("filter", 0644, dir->entry, dir,
+ entry = tracefs_create_file("filter", 0644, dir->entry, dir,
&ftrace_subsystem_filter_fops);
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warn("Could not create debugfs '%s/filter' entry\n", name);
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
}
trace_create_file("enable", 0644, dir->entry, dir,
@@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
d_events = parent;
name = ftrace_event_name(call);
- file->dir = debugfs_create_dir(name, d_events);
+ file->dir = tracefs_create_dir(name, d_events);
if (!file->dir) {
- pr_warn("Could not create debugfs '%s' directory\n", name);
+ pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
@@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod)
return 0;
}
+static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+{
+ int rlen;
+ int elen;
+
+ /* Find the length of the enum value as a string */
+ elen = snprintf(ptr, 0, "%ld", map->enum_value);
+ /* Make sure there's enough room to replace the string with the value */
+ if (len < elen)
+ return NULL;
+
+ snprintf(ptr, elen + 1, "%ld", map->enum_value);
+
+ /* Get the rest of the string of ptr */
+ rlen = strlen(ptr + len);
+ memmove(ptr + elen, ptr + len, rlen);
+ /* Make sure we end the new string */
+ ptr[elen + rlen] = 0;
+
+ return ptr + elen;
+}
+
+static void update_event_printk(struct ftrace_event_call *call,
+ struct trace_enum_map *map)
+{
+ char *ptr;
+ int quote = 0;
+ int len = strlen(map->enum_string);
+
+ for (ptr = call->print_fmt; *ptr; ptr++) {
+ if (*ptr == '\\') {
+ ptr++;
+ /* paranoid */
+ if (!*ptr)
+ break;
+ continue;
+ }
+ if (*ptr == '"') {
+ quote ^= 1;
+ continue;
+ }
+ if (quote)
+ continue;
+ if (isdigit(*ptr)) {
+ /* skip numbers */
+ do {
+ ptr++;
+ /* Check for alpha chars like ULL */
+ } while (isalnum(*ptr));
+ /*
+ * A number must have some kind of delimiter after
+ * it, and we can ignore that too.
+ */
+ continue;
+ }
+ if (isalpha(*ptr) || *ptr == '_') {
+ if (strncmp(map->enum_string, ptr, len) == 0 &&
+ !isalnum(ptr[len]) && ptr[len] != '_') {
+ ptr = enum_replace(ptr, map, len);
+ /* Hmm, enum string smaller than value */
+ if (WARN_ON_ONCE(!ptr))
+ return;
+ /*
+ * No need to decrement here, as enum_replace()
+ * returns the pointer to the character just past
+ * the enum, and two enums cannot be placed
+ * back to back without something in between.
+ * We can skip that something in between.
+ */
+ continue;
+ }
+ skip_more:
+ do {
+ ptr++;
+ } while (isalnum(*ptr) || *ptr == '_');
+ /*
+ * If what comes after this variable is a '.' or
+ * '->' then we can continue to ignore that string.
+ */
+ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
+ ptr += *ptr == '.' ? 1 : 2;
+ goto skip_more;
+ }
+ /*
+ * Once again, we can skip the delimiter that came
+ * after the string.
+ */
+ continue;
+ }
+ }
+}
+
+void trace_event_enum_update(struct trace_enum_map **map, int len)
+{
+ struct ftrace_event_call *call, *p;
+ const char *last_system = NULL;
+ int last_i;
+ int i;
+
+ down_write(&trace_event_sem);
+ list_for_each_entry_safe(call, p, &ftrace_events, list) {
+ /* events are usually grouped together with systems */
+ if (!last_system || call->class->system != last_system) {
+ last_i = 0;
+ last_system = call->class->system;
+ }
+
+ for (i = last_i; i < len; i++) {
+ if (call->class->system == map[i]->system) {
+ /* Save the first system if need be */
+ if (!last_i)
+ last_i = i;
+ update_event_printk(call, map[i]);
+ }
+ }
+ }
+ up_write(&trace_event_sem);
+}
+
static struct ftrace_event_file *
trace_create_new_event(struct ftrace_event_call *call,
struct trace_array *tr)
@@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self,
static struct notifier_block trace_module_nb = {
.notifier_call = trace_module_notify,
- .priority = 0,
+ .priority = 1, /* higher than trace.c module notify */
};
#endif /* CONFIG_MODULES */
@@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; }
/*
* The top level array has already had its ftrace_event_file
* descriptors created in order to allow for early events to
- * be recorded. This function is called after the debugfs has been
+ * be recorded. This function is called after the tracefs has been
* initialized, and we now have to create the files associated
* to the events.
*/
@@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = debugfs_create_file("set_event", 0644, parent,
+ entry = tracefs_create_file("set_event", 0644, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
- pr_warn("Could not create debugfs 'set_event' entry\n");
+ pr_warn("Could not create tracefs 'set_event' entry\n");
return -ENOMEM;
}
- d_events = debugfs_create_dir("events", parent);
+ d_events = tracefs_create_dir("events", parent);
if (!d_events) {
- pr_warn("Could not create debugfs 'events' directory\n");
+ pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
@@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- debugfs_remove_recursive(tr->event_dir);
+ tracefs_remove_recursive(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
@@ -2534,10 +2653,10 @@ static __init int event_trace_init(void)
if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("available_events", 0444, d_tracer,
+ entry = tracefs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
if (!entry)
- pr_warn("Could not create debugfs 'available_events' entry\n");
+ pr_warn("Could not create tracefs 'available_events' entry\n");
if (trace_define_common_fields())
pr_warn("tracing: Failed to allocate common fields");
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 12e2b99..174a6a7 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \
}, \
.event.type = etype, \
.print_fmt = print, \
- .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
struct ftrace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 2d25ad1..9cfea4c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -6,7 +6,6 @@
* is Copyright (c) Steven Rostedt <srostedt@redhat.com>
*
*/
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
* The curr_ret_stack is initialized to -1 and get increased
* in this function. So it can be less than -1 only if it was
* filtered out via ftrace_graph_notrace_addr() which can be
- * set from set_graph_notrace file in debugfs by user.
+ * set from set_graph_notrace file in tracefs by user.
*/
if (current->curr_ret_stack < -1)
return -EBUSY;
@@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = {
.llseek = generic_file_llseek,
};
-static __init int init_graph_debugfs(void)
+static __init int init_graph_tracefs(void)
{
struct dentry *d_tracer;
@@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void)
return 0;
}
-fs_initcall(init_graph_debugfs);
+fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index dc34625..d0ce590 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size)
#define fetch_file_offset_string_size NULL
/* Fetch type information table */
-const struct fetch_type kprobes_fetch_type_table[] = {
+static const struct fetch_type kprobes_fetch_type_table[] = {
/* Special types */
[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
sizeof(u32), 1, "__data_loc char[]"),
@@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv)
/* Parse fetch argument */
ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
- is_return, true);
+ is_return, true,
+ kprobes_fetch_type_table);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
@@ -1318,7 +1319,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
return ret;
}
-/* Make a debugfs interface for controlling probe points */
+/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
struct dentry *d_tracer;
@@ -1331,20 +1332,20 @@ static __init int init_kprobe_trace(void)
if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+ entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
NULL, &kprobe_events_ops);
/* Event list interface */
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_events' entry\n");
/* Profile interface */
- entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
NULL, &kprobe_profile_ops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_profile' entry\n");
return 0;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index b983b2f..1769a81 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
/* Recursive argument parser */
static int parse_probe_arg(char *arg, const struct fetch_type *t,
- struct fetch_param *f, bool is_return, bool is_kprobe)
+ struct fetch_param *f, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
{
- const struct fetch_type *ftbl;
unsigned long param;
long offset;
char *tmp;
int ret = 0;
- ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
- BUG_ON(ftbl == NULL);
-
switch (arg[0]) {
case '$':
ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
@@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
dprm->fetch_size = get_fetch_size_function(t,
dprm->fetch, ftbl);
ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
- is_kprobe);
+ is_kprobe, ftbl);
if (ret)
kfree(dprm);
else {
@@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf,
/* String length checking wrapper */
int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe)
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
{
- const struct fetch_type *ftbl;
const char *t;
int ret;
- ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
- BUG_ON(ftbl == NULL);
-
if (strlen(arg) > MAX_ARGSTR_LEN) {
pr_info("Argument is too long.: %s\n", arg);
return -ENOSPC;
@@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
}
parg->offset = *size;
*size += parg->type->size;
- ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
+ ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return,
+ is_kprobe, ftbl);
if (ret >= 0 && t != NULL)
ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4f815fb..ab283e1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -25,7 +25,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
#define FETCH_TYPE_STRING 0
#define FETCH_TYPE_STRSIZE 1
-/*
- * Fetch type information table.
- * It's declared as a weak symbol due to conditional compilation.
- */
-extern __weak const struct fetch_type kprobes_fetch_type_table[];
-extern __weak const struct fetch_type uprobes_fetch_type_table[];
-
#ifdef CONFIG_KPROBE_EVENT
struct symbol_cache;
unsigned long update_symbol_cache(struct symbol_cache *sc);
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
}
extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe);
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl);
extern int traceprobe_conflict_field_name(const char *name,
struct probe_arg *args, int narg);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75e19e8..6cf9353 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -12,7 +12,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include "trace_stat.h"
#include "trace.h"
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session)
static void destroy_session(struct stat_session *session)
{
- debugfs_remove(session->file);
+ tracefs_remove(session->file);
__reset_stat_session(session);
mutex_destroy(&session->stat_mutex);
kfree(session);
@@ -279,9 +279,9 @@ static int tracing_stat_init(void)
if (IS_ERR(d_tracing))
return 0;
- stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+ stat_dir = tracefs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'trace_stat' entry\n");
return 0;
}
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session)
if (!stat_dir && tracing_stat_init())
return -ENODEV;
- session->file = debugfs_create_file(session->ts->name, 0644,
+ session->file = tracefs_create_file(session->ts->name, 0644,
stat_dir,
session, &tracing_stat_fops);
if (!session->file)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 996e452..d60fe62 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string)
DEFINE_FETCH_file_offset(string_size)
/* Fetch type information table */
-const struct fetch_type uprobes_fetch_type_table[] = {
+static const struct fetch_type uprobes_fetch_type_table[] = {
/* Special types */
[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
sizeof(u32), 1, "__data_loc char[]"),
@@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv)
/* Parse fetch argument */
ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
- is_return, false);
+ is_return, false,
+ uprobes_fetch_type_table);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 41ff75b..586ad91 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,6 +159,7 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
+ struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
@@ -230,7 +231,7 @@ struct wq_device;
*/
struct workqueue_struct {
struct list_head pwqs; /* WR: all pwqs of this wq */
- struct list_head list; /* PL: list of all workqueues */
+ struct list_head list; /* PR: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
@@ -257,6 +258,13 @@ struct workqueue_struct {
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
+ /*
+ * Destruction of workqueue_struct is sched-RCU protected to allow
+ * walking the workqueues list without grabbing wq_pool_mutex.
+ * This is used to dump all workqueues from sysrq.
+ */
+ struct rcu_head rcu;
+
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
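The new rcu head, together with the list_add_tail_rcu()/list_del_rcu()/call_rcu_sched() changes further down, lets show_workqueue_state() walk the workqueues list from the sysrq path without taking wq_pool_mutex. A condensed sketch of the pairing, using the same primitives as the patch (example_remove_wq()/example_walk_wqs() are made-up names; the real paths are destroy_workqueue() and show_workqueue_state()):

	/* writer: unlink under wq_pool_mutex, free only after a sched-RCU grace period */
	static void example_remove_wq(struct workqueue_struct *wq)
	{
		mutex_lock(&wq_pool_mutex);
		list_del_rcu(&wq->list);
		mutex_unlock(&wq_pool_mutex);

		call_rcu_sched(&wq->rcu, rcu_free_wq);	/* readers may still be walking it */
	}

	/* reader: disabling preemption via rcu_read_lock_sched() is all that is needed */
	static void example_walk_wqs(void)
	{
		struct workqueue_struct *wq;

		rcu_read_lock_sched();
		list_for_each_entry_rcu(wq, &workqueues, list)
			pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
		rcu_read_unlock_sched();
	}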
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static LIST_HEAD(workqueues); /* PL: list of all workqueues */
+static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* the per-cpu worker pools */
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void copy_workqueue_attrs(struct workqueue_attrs *to,
const struct workqueue_attrs *from);
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker)
*/
if (!mutex_trylock(&pool->manager_arb))
return false;
+ pool->manager = worker;
maybe_create_worker(pool);
+ pool->manager = NULL;
mutex_unlock(&pool->manager_arb);
return true;
}
@@ -2303,6 +2314,7 @@ repeat:
struct wq_barrier {
struct work_struct work;
struct completion done;
+ struct task_struct *task; /* purely informational */
};
static void wq_barrier_func(struct work_struct *work)
@@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
+ barr->task = current;
/*
* If @target is currently being executed, schedule the
@@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set are visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
- * following attributes.
- *
- * per_cpu RO bool : whether the workqueue is per-cpu or unbound
- * max_active RW int : maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
- *
- * id RO int : the associated pool ID
- * nice RW int : nice value of the workers
- * cpumask RW mask : bitmask of allowed CPUs for the workers
- */
-struct wq_device {
- struct workqueue_struct *wq;
- struct device dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- return wq_dev->wq;
-}
-
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
-}
-static DEVICE_ATTR_RO(per_cpu);
-
-static ssize_t max_active_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
-}
-
-static ssize_t max_active_store(struct device *dev,
- struct device_attribute *attr, const char *buf,
- size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int val;
-
- if (sscanf(buf, "%d", &val) != 1 || val <= 0)
- return -EINVAL;
-
- workqueue_set_max_active(wq, val);
- return count;
-}
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
- &dev_attr_per_cpu.attr,
- &dev_attr_max_active.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
-
-static ssize_t wq_pool_ids_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- const char *delim = "";
- int node, written = 0;
-
- rcu_read_lock_sched();
- for_each_node(node) {
- written += scnprintf(buf + written, PAGE_SIZE - written,
- "%s%d:%d", delim, node,
- unbound_pwq_by_node(wq, node)->pool->id);
- delim = " ";
- }
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock_sched();
-
- return written;
-}
-
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
- struct workqueue_attrs *attrs;
-
- attrs = alloc_workqueue_attrs(GFP_KERNEL);
- if (!attrs)
- return NULL;
-
- mutex_lock(&wq->mutex);
- copy_workqueue_attrs(attrs, wq->unbound_attrs);
- mutex_unlock(&wq->mutex);
- return attrs;
-}
-
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- if (sscanf(buf, "%d", &attrs->nice) == 1 &&
- attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
- ret = apply_workqueue_attrs(wq, attrs);
- else
- ret = -EINVAL;
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
- cpumask_pr_args(wq->unbound_attrs->cpumask));
- mutex_unlock(&wq->mutex);
- return written;
-}
-
-static ssize_t wq_cpumask_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = cpumask_parse(buf, attrs->cpumask);
- if (!ret)
- ret = apply_workqueue_attrs(wq, attrs);
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n",
- !wq->unbound_attrs->no_numa);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int v, ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = -EINVAL;
- if (sscanf(buf, "%d", &v) == 1) {
- attrs->no_numa = !v;
- ret = apply_workqueue_attrs(wq, attrs);
- }
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
- __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
- __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
- __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
- __ATTR_NULL,
-};
-
-static struct bus_type wq_subsys = {
- .name = "workqueue",
- .dev_groups = wq_sysfs_groups,
-};
-
-static int __init wq_sysfs_init(void)
-{
- return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
-
-static void wq_device_release(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- kfree(wq_dev);
-}
-
-/**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
- *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
- *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
- *
- * Return: 0 on success, -errno on failure.
- */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev;
- int ret;
-
- /*
- * Adjusting max_active or creating new pwqs by applying
- * attributes breaks ordering guarantee. Disallow exposing ordered
- * workqueues.
- */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
- return -EINVAL;
-
- wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
- if (!wq_dev)
- return -ENOMEM;
-
- wq_dev->wq = wq;
- wq_dev->dev.bus = &wq_subsys;
- wq_dev->dev.init_name = wq->name;
- wq_dev->dev.release = wq_device_release;
-
- /*
- * unbound_attrs are created separately. Suppress uevent until
- * everything is ready.
- */
- dev_set_uevent_suppress(&wq_dev->dev, true);
-
- ret = device_register(&wq_dev->dev);
- if (ret) {
- kfree(wq_dev);
- wq->wq_dev = NULL;
- return ret;
- }
-
- if (wq->flags & WQ_UNBOUND) {
- struct device_attribute *attr;
-
- for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
- ret = device_create_file(&wq_dev->dev, attr);
- if (ret) {
- device_unregister(&wq_dev->dev);
- wq->wq_dev = NULL;
- return ret;
- }
- }
- }
-
- dev_set_uevent_suppress(&wq_dev->dev, false);
- kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
- return 0;
-}
-
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev = wq->wq_dev;
-
- if (!wq->wq_dev)
- return;
-
- wq->wq_dev = NULL;
- device_unregister(&wq_dev->dev);
-}
-#else /* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
-#endif /* CONFIG_SYSFS */
-
/**
* free_workqueue_attrs - free a workqueue_attrs
* @attrs: workqueue_attrs to free
@@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool)
return 0;
}
+static void rcu_free_wq(struct rcu_head *rcu)
+{
+ struct workqueue_struct *wq =
+ container_of(rcu, struct workqueue_struct, rcu);
+
+ if (!(wq->flags & WQ_UNBOUND))
+ free_percpu(wq->cpu_pwqs);
+ else
+ free_workqueue_attrs(wq->unbound_attrs);
+
+ kfree(wq->rescuer);
+ kfree(wq);
+}
+
static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
/*
* If we're the last pwq going away, @wq is already dead and no one
- * is gonna access it anymore. Free it.
+ * is gonna access it anymore. Schedule RCU free.
*/
- if (is_last) {
- free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq);
- }
+ if (is_last)
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
}
/**
@@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
- list_add(&wq->list, &workqueues);
+ list_add_tail_rcu(&wq->list, &workqueues);
mutex_unlock(&wq_pool_mutex);
@@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
* flushing is complete in case freeze races us.
*/
mutex_lock(&wq_pool_mutex);
- list_del_init(&wq->list);
+ list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
workqueue_sysfs_unregister(wq);
- if (wq->rescuer) {
+ if (wq->rescuer)
kthread_stop(wq->rescuer->task);
- kfree(wq->rescuer);
- wq->rescuer = NULL;
- }
if (!(wq->flags & WQ_UNBOUND)) {
/*
* The base ref is never dropped on per-cpu pwqs. Directly
- * free the pwqs and wq.
+ * schedule RCU free.
*/
- free_percpu(wq->cpu_pwqs);
- kfree(wq);
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
} else {
/*
* We're the sole accessor of @wq at this point. Directly
@@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
}
}
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+ pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+ if (pool->node != NUMA_NO_NODE)
+ pr_cont(" node=%d", pool->node);
+ pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+ if (work->func == wq_barrier_func) {
+ struct wq_barrier *barr;
+
+ barr = container_of(work, struct wq_barrier, work);
+
+ pr_cont("%s BAR(%d)", comma ? "," : "",
+ task_pid_nr(barr->task));
+ } else {
+ pr_cont("%s %pf", comma ? "," : "", work->func);
+ }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work;
+ struct worker *worker;
+ bool has_in_flight = false, has_pending = false;
+ int bkt;
+
+ pr_info(" pwq %d:", pool->id);
+ pr_cont_pool_info(pool);
+
+ pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq == pwq) {
+ has_in_flight = true;
+ break;
+ }
+ }
+ if (has_in_flight) {
+ bool comma = false;
+
+ pr_info(" in-flight:");
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq != pwq)
+ continue;
+
+ pr_cont("%s %d%s:%pf", comma ? "," : "",
+ task_pid_nr(worker->task),
+ worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->current_func);
+ list_for_each_entry(work, &worker->scheduled, entry)
+ pr_cont_work(false, work);
+ comma = true;
+ }
+ pr_cont("\n");
+ }
+
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ has_pending = true;
+ break;
+ }
+ }
+ if (has_pending) {
+ bool comma = false;
+
+ pr_info(" pending:");
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) != pwq)
+ continue;
+
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+
+ if (!list_empty(&pwq->delayed_works)) {
+ bool comma = false;
+
+ pr_info(" delayed:");
+ list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ unsigned long flags;
+ int pi;
+
+ rcu_read_lock_sched();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ struct pool_workqueue *pwq;
+ bool idle = true;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle)
+ continue;
+
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ show_pwq(pwq);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ }
+
+ for_each_pool(pool, pi) {
+ struct worker *worker;
+ bool first = true;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" workers=%d", pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ next_pool:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ rcu_read_unlock_sched();
+}
+
/*
* CPU hotplug.
*
@@ -4834,6 +4698,323 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set are visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ * id RO int : the associated pool ID
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+ struct workqueue_struct *wq;
+ struct device dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int val;
+
+ if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+ return -EINVAL;
+
+ workqueue_set_max_active(wq, val);
+ return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+ &dev_attr_per_cpu.attr,
+ &dev_attr_max_active.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ const char *delim = "";
+ int node, written = 0;
+
+ rcu_read_lock_sched();
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ rcu_read_unlock_sched();
+
+ return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ mutex_lock(&wq->mutex);
+ copy_workqueue_attrs(attrs, wq->unbound_attrs);
+ mutex_unlock(&wq->mutex);
+ return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+ attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+ ret = apply_workqueue_attrs(wq, attrs);
+ else
+ ret = -EINVAL;
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+ cpumask_pr_args(wq->unbound_attrs->cpumask));
+ mutex_unlock(&wq->mutex);
+ return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, attrs->cpumask);
+ if (!ret)
+ ret = apply_workqueue_attrs(wq, attrs);
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs(wq, attrs);
+ }
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+ __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+ __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+ .name = "workqueue",
+ .dev_groups = wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+ return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev;
+ int ret;
+
+ /*
+ * Adjusting max_active or creating new pwqs by applying
+ * attributes breaks ordering guarantee. Disallow exposing ordered
+ * workqueues.
+ */
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return -EINVAL;
+
+ wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+ if (!wq_dev)
+ return -ENOMEM;
+
+ wq_dev->wq = wq;
+ wq_dev->dev.bus = &wq_subsys;
+ wq_dev->dev.init_name = wq->name;
+ wq_dev->dev.release = wq_device_release;
+
+ /*
+ * unbound_attrs are created separately. Suppress uevent until
+ * everything is ready.
+ */
+ dev_set_uevent_suppress(&wq_dev->dev, true);
+
+ ret = device_register(&wq_dev->dev);
+ if (ret) {
+ kfree(wq_dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+
+ if (wq->flags & WQ_UNBOUND) {
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+ ret = device_create_file(&wq_dev->dev, attr);
+ if (ret) {
+ device_unregister(&wq_dev->dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+ }
+ }
+
+ dev_set_uevent_suppress(&wq_dev->dev, false);
+ kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+ return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev = wq->wq_dev;
+
+ if (!wq->wq_dev)
+ return;
+
+ wq->wq_dev = NULL;
+ device_unregister(&wq_dev->dev);
+}
+#else /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;