From 7539a3b3d1f892dd97eaf094134d7de55c13befe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 13 Dec 2009 00:07:30 +0100 Subject: sched: Make wakeup side and atomic variants of completion API irq safe Alan Stern noticed that all the wakeup side (and atomic) variants of the completion APIs should be irq safe, but the newly introduced completion_done() and try_wait_for_completion() aren't. The use of the irq unsafe variants in IRQ contexts can cause crashes/hangs. Fix the problem by making them use spin_lock_irqsave() and spin_unlock_irqrestore(). Reported-by: Alan Stern Signed-off-by: Rafael J. Wysocki Cc: Linus Torvalds Cc: Zhang Rui Cc: pm list Cc: Peter Zijlstra Cc: David Chinner Cc: Lachlan McIlroy LKML-Reference: <200912130007.30541.rjw@sisk.pl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index ff39cad..8b3532f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5908,14 +5908,15 @@ EXPORT_SYMBOL(wait_for_completion_killable); */ bool try_wait_for_completion(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -5930,12 +5931,13 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); -- cgit v0.10.2 From 663997d417330a59a566452f52cfa04c8ffd190b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 12 Dec 2009 13:57:27 -0800 Subject: sched: Use pr_fmt() and pr_<level>() - Convert printk(KERN_<LEVEL> to pr_<level> (not KERN_DEBUG) - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Coalesce long 
format strings - Add missing \n to "ERROR: !SD_LOAD_BALANCE domain has parent" Signed-off-by: Joe Perches Cc: Peter Zijlstra LKML-Reference: <1260655047.2637.7.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 8b3532f..258c73c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -26,6 +26,8 @@ * Thomas Gleixner, Mike Kravetz */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -5337,8 +5339,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); print_modules(); @@ -6906,23 +6908,23 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + pr_info("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(KERN_CONT " running "); + pr_cont(" running "); else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); + pr_cont(" %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); + pr_cont(" %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + pr_cont("%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), task_pid_nr(p->real_parent), (unsigned long)task_thread_info(p)->flags); @@ -6934,11 +6936,9 @@ void show_state_filter(unsigned long state_filter) struct task_struct *g, *p; #if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #else - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -7296,9 +7296,8 @@ again: * leave kernel. 
*/ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); + pr_info("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } } @@ -7805,48 +7804,44 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_DEBUG "%*s domain %d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); + pr_cont("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n"); return -1; } - printk(KERN_CONT "span %s level %s\n", str, sd->name); + pr_cont("span %s level %s\n", str, sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); + pr_err("ERROR: domain->span does not contain CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); + pr_cont("\n"); + pr_err("ERROR: group is NULL\n"); break; } if (!group->cpu_power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + pr_cont("\n"); + pr_err("ERROR: domain->cpu_power not set\n"); break; } if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); + pr_cont("\n"); + pr_err("ERROR: empty group\n"); break; } if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); + pr_cont("\n"); + pr_err("ERROR: repeated CPUs\n"); break; } @@ -7854,23 +7849,21 @@ static int sched_domain_debug_one(struct 
sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s", str); + pr_cont(" %s", str); if (group->cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + pr_cont(" (cpu_power = %d)", group->cpu_power); } group = group->next; } while (group != sd->groups); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + pr_err("ERROR: groups don't span domain->span\n"); if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); + pr_err("ERROR: parent span is not a superset of domain->span\n"); return 0; } @@ -8426,8 +8419,7 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); + pr_warning("Can not alloc domain group for node %d\n", num); return -ENOMEM; } d->sched_group_nodes[num] = sg; @@ -8456,8 +8448,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); + pr_warning("Can not alloc domain group for node %d\n", + j); return -ENOMEM; } sg->cpu_power = 0; @@ -8685,7 +8677,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, d->sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); + pr_warning("Can not alloc sched group node list\n"); return sa_notcovered; } sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; @@ -8702,7 +8694,7 @@ static enum s_alloc 
__visit_domain_allocation_hell(struct s_data *d, return sa_send_covered; d->rd = alloc_rootdomain(); if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); + pr_warning("Cannot alloc root domain\n"); return sa_tmpmask; } return sa_rootdomain; @@ -9684,13 +9676,11 @@ void __might_sleep(char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 33d5384..b810e22 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -35,7 +35,7 @@ static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + pr_err("bad: scheduling from the idle thread!\n"); dump_stack(); spin_lock_irq(&rq->lock); } -- cgit v0.10.2 From 5fe85be081edf0ac92d83f9c39e0ab5c1371eb82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:14:58 +0000 Subject: sched: Use rcu in sys_sched_getscheduler/sys_sched_getparam() read_lock(&tasklist_lock) does not protect sys_sched_getscheduler and sys_sched_getparam() against a concurrent update of the policy or scheduler parameters as do_sched_setscheduler() does not take the tasklist_lock. The accessed integers can be retrieved w/o locking and are snapshots anyway. Using rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away is not changing the above situation. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.753790977@linutronix.de> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 258c73c..1782bee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6458,7 +6458,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); @@ -6466,7 +6466,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) retval = p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6484,7 +6484,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) @@ -6495,7 +6495,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) goto out_unlock; lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * This one might sleep, we cannot do it with a spinlock held ... @@ -6505,7 +6505,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v0.10.2 From 23f5d142519621b16cf2b378cf8adf4dcf01a616 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:01 +0000 Subject: sched: Use rcu in sched_get/set_affinity() tasklist_lock is held read locked to protect the find_task_by_vpid() call and to prevent the task going away. sched_setaffinity acquires a task struct ref and drops tasklist lock right away. The access to the cpus_allowed mask is protected by rq->lock. rcu_read_lock() provides the same protection here. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.789059966@linutronix.de> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 1782bee..7989312 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6516,22 +6516,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return -ESRCH; } - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. - */ + /* Prevent p going away */ get_task_struct(p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; @@ -6617,7 +6613,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -6633,7 +6629,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) task_rq_unlock(rq, &flags); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return retval; -- cgit v0.10.2 From 1a551ae715825bb2a2107a2dd68de024a1fa4e32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:11 +0000 Subject: sched: Use rcu in sched_get_rr_param() read_lock(&tasklist_lock) does not protect sys_sched_rr_get_interval() against a concurrent update of the policy or scheduler parameters as do_sched_setscheduler() does not take the tasklist_lock. The access to task->sched_class->get_rr_interval is protected by task_rq_lock(task). Use rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.862897167@linutronix.de> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 7989312..db5c266 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6873,7 +6873,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) goto out_unlock; @@ -6886,13 +6886,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, time_slice = p->sched_class->get_rr_interval(rq, p); task_rq_unlock(rq, &flags); - read_unlock(&tasklist_lock); + rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v0.10.2 From b9f8fcd55bbdb037e5332dbdb7b494f0b70861ac Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 13 Dec 2009 18:25:02 -0800 Subject: sched: Fix cpu_clock() in NMIs, on !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Relax stable-sched-clock architectures to not save/disable/restore hardirqs in cpu_clock(). The background is that I was trying to resolve a sparc64 perf issue when I discovered this problem. On sparc64 I implement pseudo NMIs by simply running the kernel at IRQ level 14 when local_irq_disable() is called; this allows performance counter events to still come in at IRQ level 15. This doesn't work if any code in an NMI handler does local_irq_save() or local_irq_disable() since the "disable" will kick us back to cpu IRQ level 14 thus letting NMIs back in and we recurse. The only path that does that in the perf event IRQ handling path is the code supporting frequency based events. It uses cpu_clock(). cpu_clock() simply invokes sched_clock() with IRQs disabled. And that's a fundamental bug all on its own, particularly for the HAVE_UNSTABLE_SCHED_CLOCK case. 
NMIs can thus get into the sched_clock() code interrupting the local IRQ disable code sections of it. Furthermore, for the not-HAVE_UNSTABLE_SCHED_CLOCK case, the IRQ disabling done by cpu_clock() is just pure overhead and completely unnecessary. So the core problem is that sched_clock() is not NMI safe, but we are invoking it from NMI contexts in the perf events code (via cpu_clock()). A less important issue is the overhead of IRQ disabling when it isn't necessary in cpu_clock(). CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures are not affected by this patch. Signed-off-by: David S. Miller Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091213.182502.215092085.davem@davemloft.net> Signed-off-by: Ingo Molnar diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 479ce56..5b49613 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(cpu); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { - unsigned long long clock; - unsigned long flags; + return sched_clock_cpu(cpu); +} - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - return clock; -} EXPORT_SYMBOL_GPL(cpu_clock); -- cgit v0.10.2 From 933b0618d8b2a59c7a0742e43836544e02f1e9bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:31 +0100 Subject: sched: Mark boot-cpu active before smp_init() A UP machine has 1 active cpu, not having the boot-cpu in the active map when starting the scheduler 
confuses things. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.423469527@chello.nl> Signed-off-by: Ingo Molnar diff --git a/init/main.c b/init/main.c index c3db4a9..dac44a9 100644 --- a/init/main.c +++ b/init/main.c @@ -369,12 +369,6 @@ static void __init smp_init(void) { unsigned int cpu; - /* - * Set up the current CPU as possible to migrate to. - * The other ones will be done by cpu_up/cpu_down() - */ - set_cpu_active(smp_processor_id(), true); - /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) @@ -486,6 +480,7 @@ static void __init boot_cpu_init(void) int cpu = smp_processor_id(); /* Mark the boot cpu "present", "online" etc for SMP and UP case */ set_cpu_online(cpu, true); + set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); } -- cgit v0.10.2 From 9ee349ad6d326df3633d43f54202427295999c47 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 16 Dec 2009 18:04:32 +0100 Subject: sched: Fix set_cpu_active() in cpu_down() Sachin found cpu hotplug test failures on powerpc, which made the kernel hang on his POWER box. The problem is that we fail to re-activate a cpu when a hot-unplug fails. Fix this by moving the de-activation into _cpu_down after doing the initial checks. Remove the synchronize_sched() calls and rely on those implied by rebuilding the sched domains using the new mask. 
Reported-by: Sachin Sant Signed-off-by: Xiaotian Feng Tested-by: Sachin Sant Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.500272612@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/cpu.c b/kernel/cpu.c index 291ac58..1c8ddd6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -ENOMEM; cpu_hotplug_begin(); + set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { @@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu) goto out; } - set_cpu_active(cpu, false); - - /* - * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_mask. - * This is not strictly necessary becuase stop_machine() - * that we run down the line already provides the required - * synchronization. But it's really a side effect and we do not - * want to depend on the innards of the stop_machine here. - */ - synchronize_sched(); - err = _cpu_down(cpu, 0); out: @@ -382,19 +371,12 @@ int disable_nonboot_cpus(void) return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); - /* We take down all of the non-boot CPUs in one shot to avoid races + /* + * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - set_cpu_active(cpu, false); - } - - synchronize_sched(); - printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) -- cgit v0.10.2 From e6c8fba7771563b2f3dfb96a78f36ec17e15bdf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:33 +0100 Subject: sched: Fix task_hot() test order Make sure not to access sched_fair fields before verifying it is indeed a sched_fair task. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith CC: stable@kernel.org LKML-Reference: <20091216170517.577998058@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 9c30858..1d8ca25 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2046,6 +2046,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + if (p->sched_class != &fair_sched_class) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2054,9 +2057,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) &p->se == cfs_rq_of(&p->se)->last)) return 1; - if (p->sched_class != &fair_sched_class) - return 0; - if (sysctl_sched_migration_cost == -1) return 1; if (sysctl_sched_migration_cost == 0) -- cgit v0.10.2 From e4f4288842ee12747e10c354d72be7d424c0b627 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:34 +0100 Subject: sched: Select_task_rq_fair() must honour SD_LOAD_BALANCE We should skip !SD_LOAD_BALANCE domains. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.653578430@chello.nl> CC: stable@kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5bedf6e..ec1d271 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,6 +1429,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; + /* * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. -- cgit v0.10.2 From 06b83b5fbea273672822b6ee93e16781046553ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:35 +0100 Subject: sched: Use TASK_WAKING for fork wakups For later convenience use TASK_WAKING for fresh tasks. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.732561278@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 1d8ca25..1672823 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2540,14 +2540,6 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; } /* @@ -2558,6 +2550,12 @@ void sched_fork(struct task_struct *p, int clone_flags) int cpu = get_cpu(); __sched_fork(p); + /* + * We mark the process as waking here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_WAKING; /* * Revert to default priority/policy on fork if requested. 
@@ -2626,7 +2624,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) struct rq *rq; rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); + BUG_ON(p->state != TASK_WAKING); + p->state = TASK_RUNNING; update_rq_clock(rq); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); @@ -6984,6 +6983,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); -- cgit v0.10.2 From e2912009fb7b715728311b0d8fe327a1432b3f79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:36 +0100 Subject: sched: Ensure set_task_cpu() is never called on blocked tasks In order to clean up the set_task_cpu() rq dependencies we need to ensure it is never called on blocked tasks because such usage does not pair with consistent rq->lock usage. This puts the migration burden on ttwu(). Furthermore we need to close a race against changing ->cpus_allowed, since select_task_rq() runs with only preemption disabled. For sched_fork() this is safe because the child isn't in the tasklist yet; for wakeup we fix this by synchronizing set_cpus_allowed_ptr() against TASK_WAKING, which leaves sched_exec to be a problem. This also closes a hole in (6ad4c1888 sched: Fix balance vs hotplug race) where ->select_task_rq() doesn't validate the result against the sched_domain/root_domain. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 1672823..33d7965 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2018,22 +2018,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - /* Must have done schedule() in kthread() before we set_task_cpu */ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - raw_spin_lock_irqsave(&rq->lock, flags); - update_rq_clock(rq); - set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; p->flags |= PF_THREAD_BOUND; - raw_spin_unlock_irqrestore(&rq->lock, flags); } EXPORT_SYMBOL(kthread_bind); @@ -2074,6 +2067,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); +#endif + trace_sched_migrate_task(p, new_cpu); if (old_cpu != new_cpu) { @@ -2107,13 +2108,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. + * the next wake-up will properly place the task. 
*/ - if (!p->se.on_rq && !task_running(rq, p)) { - update_rq_clock(rq); - set_task_cpu(p, dest_cpu); + if (!p->se.on_rq && !task_running(rq, p)) return 0; - } init_completion(&req->done); req->task = p; @@ -2319,10 +2317,42 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +/* + * Called from: + * + * - fork, @p is stable because it isn't on the tasklist yet + * + * - exec, @p is unstable XXX + * + * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so + * we should be good. + */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - return p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpu_active(cpu))) { + + cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + /* + * XXX: race against hot-plug modifying cpu_active_mask + */ + BUG_ON(cpu >= nr_cpu_ids); + } + + return cpu; } #endif @@ -7098,7 +7128,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; + /* + * Since we rely on wake-ups to migrate sleeping tasks, don't change + * the ->cpus_allowed mask from under waking tasks, which would be + * possible when we change rq->lock in ttwu(), so synchronize against + * TASK_WAKING to avoid that. 
+ */ +again: + while (p->state == TASK_WAKING) + cpu_relax(); + rq = task_rq_lock(p, &flags); + + if (p->state == TASK_WAKING) { + task_rq_unlock(rq, &flags); + goto again; + } + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; @@ -7154,7 +7200,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; @@ -7170,12 +7216,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; - on_rq = p->se.on_rq; - if (on_rq) + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->se.on_rq) { deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { + set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } -- cgit v0.10.2 From 3802290628348674985d14914f9bfee7b9084548 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:37 +0100 Subject: sched: Fix sched_exec() balancing Since we access ->cpus_allowed without holding rq->lock we need a retry loop to validate the result, this comes for near free when we merge sched_migrate_task() into sched_exec() since that already does the needed check. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.884743662@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 33d7965..63e55ac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2322,7 +2322,7 @@ void task_oncpu_function_call(struct task_struct *p, * * - fork, @p is stable because it isn't on the tasklist yet * - * - exec, @p is unstable XXX + * - exec, @p is unstable, retry loop * * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so * we should be good. 
@@ -3132,21 +3132,36 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +void sched_exec(void) { + struct task_struct *p = current; struct migration_req req; + int dest_cpu, this_cpu; unsigned long flags; struct rq *rq; +again: + this_cpu = get_cpu(); + dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == this_cpu) { + put_cpu(); + return; + } + rq = task_rq_lock(p, &flags); + put_cpu(); + + /* + * select_task_rq() can race against ->cpus_allowed + */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; + || unlikely(!cpu_active(dest_cpu))) { + task_rq_unlock(rq, &flags); + goto again; + } /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { @@ -3161,24 +3176,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) return; } -out: task_rq_unlock(rq, &flags); } /* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - -/* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. 
*/ -- cgit v0.10.2 From 5da9a0fb673a0ea0a093862f95f6b89b3390c31e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:38 +0100 Subject: sched: Fix select_task_rq() vs hotplug issues Since select_task_rq() is now responsible for guaranteeing ->cpus_allowed and cpu_active_mask, we need to verify this. select_task_rq_rt() can blindly return smp_processor_id()/task_cpu() without checking the valid masks, select_task_rq_fair() can do the same in the rare case that all SD_flags are disabled. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.961475466@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 63e55ac..cc40bda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2317,6 +2317,43 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + return dest_cpu; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rcu_read_lock(); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + rcu_read_unlock(); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. 
+ */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } + + return dest_cpu; +} + /* * Called from: * @@ -2343,14 +2380,8 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_active(cpu))) { - - cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - /* - * XXX: race against hot-plug modifying cpu_active_mask - */ - BUG_ON(cpu >= nr_cpu_ids); - } + !cpu_active(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); return cpu; } @@ -7319,36 +7350,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto move; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - goto move; - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - pr_info("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } + dest_cpu = select_fallback_rq(dead_cpu, p); -move: /* It can have affinity changed while we were choosing. 
*/ if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) goto again; -- cgit v0.10.2 From 881232b70b195768a71cd74ff4b4e8ab9502997b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:39 +0100 Subject: sched: Move kthread_bind() back to kthread.c Since kthread_bind() lost its dependencies on sched.c, move it back where it came from. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.039524041@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/kthread.c b/kernel/kthread.c index ab7ae57..fbb6222 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), EXPORT_SYMBOL(kthread_create); /** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; +} +EXPORT_SYMBOL(kthread_bind); + +/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * diff --git a/kernel/sched.c b/kernel/sched.c index cc40bda..297dc44 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2004,32 +2004,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @p: thread created by kthread_create(). 
- * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - * - * Function lives here instead of kthread.c because it messes with - * scheduler internals which require locking. - */ -void kthread_bind(struct task_struct *p, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - - p->cpus_allowed = cpumask_of_cpu(cpu); - p->rt.nr_cpus_allowed = 1; - p->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - #ifdef CONFIG_SMP /* * Is this task likely cache-hot: -- cgit v0.10.2 From efbbd05a595343a413964ad85a2ad359b7b7efbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:40 +0100 Subject: sched: Add pre and post wakeup hooks As will be apparent in the next patch, we need a pre wakeup hook for sched_fair task migration, hence rename the post wakeup hook and one pre wakeup. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.114746117@chello.nl> Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c858f3..2c9fa1c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1091,7 +1091,8 @@ struct sched_class { enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask); diff --git a/kernel/sched.c b/kernel/sched.c index 297dc44..6c571bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2412,6 +2412,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); @@ -2475,8 +2479,8 @@ out_running: p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); if (unlikely(rq->idle_stamp)) { u64 delta = rq->clock - rq->idle_stamp; @@ -2666,8 +2670,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d2ea282..f48328a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1472,7 +1472,7 
@@ static void post_schedule_rt(struct rq *rq) * If we are not running and we are not going to reschedule soon, we should * try to push tasks away now */ -static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && @@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = { .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, - .task_wake_up = task_wake_up_rt, + .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif -- cgit v0.10.2 From 88ec22d3edb72b261f8628226cd543589a6d5e1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:41 +0100 Subject: sched: Remove the cfs_rq dependency from set_task_cpu() In order to remove the cfs_rq dependency from set_task_cpu() we need to ensure the task is cfs_rq invariant for all callsites. The simple approach is to subtract cfs_rq->min_vruntime from se->vruntime on dequeue, and add cfs_rq->min_vruntime on enqueue. However, this has the downside of breaking FAIR_SLEEPERS since we lose the old vruntime as we only maintain the relative position. To solve this, we observe that we only migrate runnable tasks, we do this using deactivate_task(.sleep=0) and activate_task(.wakeup=0), therefore we can restrain the min_vruntime invariance to that state. The only other case is wakeup balancing, since we want to maintain the old vruntime we cannot make it relative on dequeue, but since we don't migrate inactive tasks, we can do so right before we activate it again. This is where we need the new pre-wakeup hook, we need to call this while still holding the old rq->lock. We could fold it into ->select_task_rq(), but since that has multiple callsites and would obfuscate the locking requirements, that seems like a fudge.
This leaves the fork() case, simply make sure that ->task_fork() leaves the ->vruntime in a relative state. This covers all cases where set_task_cpu() gets called, and ensures it sees a relative vruntime. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.191697025@chello.nl> Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c9fa1c..973b2b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1116,7 +1116,7 @@ struct sched_class { struct task_struct *task); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*moved_group) (struct task_struct *p); + void (*moved_group) (struct task_struct *p, int on_rq); #endif }; diff --git a/kernel/sched.c b/kernel/sched.c index 6c571bd..f92ce63 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2038,8 +2038,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); #ifdef CONFIG_SCHED_DEBUG /* @@ -2056,8 +2054,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -10102,7 +10098,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec1d271..42ac3c9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); + curr->vruntime += 
delta_exec_weighted; update_min_vruntime(cfs_rq); } @@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_MIGRATE 2 + static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + se->vruntime += cfs_rq->min_vruntime; + + /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); - if (wakeup) { + if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. 
+ */ + if (!sleep) + se->vruntime -= cfs_rq->min_vruntime; } /* @@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int flags = 0; + + if (wakeup) + flags |= ENQUEUE_WAKEUP; + if (p->state == TASK_WAKING) + flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; } hrtick_update(rq); @@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq) #ifdef CONFIG_SMP +static void task_waking_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + se->vruntime -= cfs_rq->min_vruntime; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -1978,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } + se->vruntime -= cfs_rq->min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2031,12 +2066,13 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) +static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); + if (!on_rq) + place_entity(cfs_rq, &p->se, 1); } #endif @@ -2076,6 +2112,8 @@ static const struct sched_class fair_sched_class = { .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v0.10.2 From 738d2be4301007f054541c5c4bf7fb6a361c9b3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:42 +0100 Subject: sched: Simplify set_task_cpu() Rearrange code a bit now that its a simpler 
function. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.269101883@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index f92ce63..8a2bfd3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2034,11 +2034,8 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { - int old_cpu = task_cpu(p); - #ifdef CONFIG_SCHED_DEBUG /* * We should never call set_task_cpu() on a blocked task, @@ -2049,11 +2046,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); - if (old_cpu != new_cpu) { - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - } + if (task_cpu(p) == new_cpu) + return; + + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); __set_task_cpu(p, new_cpu); } -- cgit v0.10.2 From 416eb39556a03d1c7e52b0791e9052ccd71db241 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Dec 2009 06:05:49 +0100 Subject: sched: Make warning less noisy Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 8a2bfd3..af7dfa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. 
*/ - WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v0.10.2 From 234da7bcdc7aaa935846534c3b726dbc79a9cdd5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Dec 2009 20:21:05 +0100 Subject: sched: Teach might_sleep() about preemptible RCU In practice, it is harmless to voluntarily sleep in a rcu_read_lock() section if we are running under preempt rcu, but it is illegal if we build a kernel running non-preemptable rcu. Currently, might_sleep() doesn't notice sleepable operations under rcu_read_lock() sections if we are running under preemptable rcu because preempt_count() is left untouched after rcu_read_lock() in this case. But we want developers who test their changes under such config to notice the "sleeping while atomic" issues. So we add rcu_read_lock_nesting to preempt_count() in might_sleep() checks. [ v2: Handle rcu-tiny ] Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E.
McKenney Cc: Peter Zijlstra LKML-Reference: <1260991265-8451-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index c4ba9a7..96cc307 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -101,4 +101,9 @@ static inline void exit_rcu(void) { } +static inline int rcu_preempt_depth(void) +{ + return 0; +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c93eee5..8044b1b 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -45,6 +45,12 @@ extern void __rcu_read_unlock(void); extern void synchronize_rcu(void); extern void exit_rcu(void); +/* + * Defined as macro as it is a very low level header + * included from areas that don't even know about current + */ +#define rcu_preempt_depth() (current->rcu_read_lock_nesting) + #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static inline void __rcu_read_lock(void) @@ -63,6 +69,11 @@ static inline void exit_rcu(void) { } +static inline int rcu_preempt_depth(void) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ static inline void __rcu_read_lock_bh(void) diff --git a/kernel/sched.c b/kernel/sched.c index af7dfa7..7be88a7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9682,7 +9682,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = preempt_count() & ~PREEMPT_ACTIVE; + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -- cgit v0.10.2 From 733421516b42c44b9e21f1793c430cc801ef8324 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:27 +0100 Subject: sched: Move TASK_STATE_TO_CHAR_STR near the TASK_state bits So that we don't keep forgetting about it. 
Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121829.815779372@chello.nl> Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 973b2b8..c28ed1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -193,6 +193,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_WAKEKILL 128 #define TASK_WAKING 256 +#define TASK_STATE_TO_CHAR_STR "RSDTtZX" + /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) @@ -2595,8 +2597,6 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) } #endif /* CONFIG_MM_OWNER */ -#define TASK_STATE_TO_CHAR_STR "RSDTtZX" - #endif /* __KERNEL__ */ #endif -- cgit v0.10.2 From 44d90df6b757c59651ddd55f1a84f28132b50d29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:28 +0100 Subject: sched: Add missing state chars to TASK_STATE_TO_CHAR_STR We grew 3 new task states since the last time someone touched it. Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121829.892737686@chello.nl> Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index c28ed1b..94858df 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -193,7 +193,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_WAKEKILL 128 #define TASK_WAKING 256 -#define TASK_STATE_TO_CHAR_STR "RSDTtZX" +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) -- cgit v0.10.2 From 464763cf1c6df632dccc8f2f4c7e50163154a2c0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:29 +0100 Subject: sched: Update task_state_array with new states Neglected because it's hidden...
(who reads comments anyway) Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121829.970166036@chello.nl> Signed-off-by: Ingo Molnar diff --git a/fs/proc/array.c b/fs/proc/array.c index 4badde1..96361e8 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -138,9 +138,12 @@ static const char *task_state_array[] = { "S (sleeping)", /* 1 */ "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ + "t (tracing stop)", /* 8 */ "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ }; static inline const char *get_task_state(struct task_struct *tsk) -- cgit v0.10.2 From e1781538cf5c870ab696e9b8f0a5c498d3900f2f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:30 +0100 Subject: sched: Assert task state bits at build time Since everybody is lazy and prone to forgetting things, make the compiler help us a bit. Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121830.060186433@chello.nl> Signed-off-by: Ingo Molnar diff --git a/fs/proc/array.c b/fs/proc/array.c index 96361e8..f560325 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -134,14 +134,14 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. 
*/ static const char *task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ - "x (dead)", /* 64 */ + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ }; @@ -151,6 +151,8 @@ static inline const char *get_task_state(struct task_struct *tsk) unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; const char **p = &task_state_array[0]; + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + while (state) { p++; state >>= 1; diff --git a/include/linux/sched.h b/include/linux/sched.h index 94858df..3754387 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -192,9 +192,13 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 +#define TASK_STATE_MAX 512 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +extern char ___assert_task_state[1 - 2*!!( + sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; + /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -- cgit v0.10.2 From 077614ee1e93245a3b9a4e1213659405dbeb0ba6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:31 +0100 Subject: sched: Fix broken assertion There's a preemption race in the set_task_cpu() debug check in that when we get preempted after setting task->state we'd still be on the rq proper, but fail the test. Check for preempted tasks, since those are always on the RQ. 
Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121830.137155561@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 7be88a7..720df108 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v0.10.2