From 68985633bccb6066bf1803e316fbc6c1f5b796d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Dec 2015 14:04:04 +0100 Subject: sched/wait: Fix signal handling in bit wait helpers Vladimir reported getting RCU stall warnings and bisected it back to commit: 743162013d40 ("sched: Remove proliferation of wait_on_bit() action functions") That commit inadvertently reversed the calls to schedule() and signal_pending(), thereby not handling the case where the signal receives while we sleep. Reported-by: Vladimir Murzin Tested-by: Vladimir Murzin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mark.rutland@arm.com Cc: neilb@suse.de Cc: oleg@redhat.com Fixes: 743162013d40 ("sched: Remove proliferation of wait_on_bit() action functions") Fixes: cbbce8220949 ("SCHED: add some "wait..on_bit...timeout()" interfaces.") Link: http://lkml.kernel.org/r/20151201130404.GL3816@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e026..f10bd87 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -583,18 +583,18 @@ EXPORT_SYMBOL(wake_up_atomic_t); __sched int bit_wait(struct wait_bit_key *word) { - if (signal_pending_state(current->state, current)) - return 1; schedule(); + if (signal_pending(current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); __sched int bit_wait_io(struct wait_bit_key *word) { - if (signal_pending_state(current->state, current)) - return 1; io_schedule(); + if (signal_pending(current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); @@ -602,11 +602,11 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); + if (signal_pending(current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); @@ -614,11 +614,11 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); + if (signal_pending(current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); -- cgit v0.10.2 From 119d6f6a3be8b424b200dcee56e74484d5445f7e Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 30 Nov 2015 20:34:20 -0500 Subject: sched/core: Remove false-positive warning from wake_up_process() Because wakeups can (fundamentally) be late, a task might not be in the expected state. Therefore testing against a task's state is racy, and can yield false positives. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: oleg@redhat.com Fixes: 9067ac85d533 ("wake_up_process() should be never used to wakeup a TASK_STOPPED/TRACED task") Link: http://lkml.kernel.org/r/1448933660-23082-1-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac..fc8c987 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2039,7 +2039,6 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); -- cgit v0.10.2 From 8295c69925ad53ec32ca54ac9fc194ff21bc40e2 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 2 Dec 2015 19:52:59 +0800 Subject: sched/core: Clear the root_domain cpumasks in init_rootdomain() root_domain::rto_mask allocated through alloc_cpumask_var() contains garbage data, this may cause problems. For instance, When doing pull_rt_task(), it may do useless iterations if rto_mask retains some extra garbage bits. Worse still, this violates the isolated domain rule for clustered scheduling using cpuset, because the tasks(with all the cpus allowed) belongs to one root domain can be pulled away into another root domain. The patch cleans the garbage by using zalloc_cpumask_var() instead of alloc_cpumask_var() for root_domain::rto_mask allocation, thereby addressing the issues. Do the same thing for root_domain's other cpumask memembers: dlo_mask, span, and online. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449057179-29321-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc8c987..eee4ee6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5846,13 +5846,13 @@ static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); -- cgit v0.10.2 From 2541117b0cf79977fa11a0d6e17d61010677bd7b Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 19 Nov 2015 16:47:28 +0100 Subject: sched/cputime: Fix invalid gtime in proc /proc/stats shows invalid gtime when the thread is running in guest. When vtime accounting is not enabled, we cannot get a valid delta. The delta is calculated with now - tsk->vtime_snap, but tsk->vtime_snap is only updated when vtime accounting is runtime enabled. This patch makes task_gtime() just return gtime without computing the buggy non-existing tickless delta when vtime accounting is not enabled. Use context_tracking_is_enabled() to check if vtime is accounting on some cpu, in which case only we need to check the tickless delta. This way we fix the gtime value regression on machines not running nohz full. The kernel config contains CONFIG_VIRT_CPU_ACCOUNTING_GEN=y and CONFIG_NO_HZ_FULL_ALL=n and boot without nohz_full. I ran and stop a busy loop in VM and see the gtime in host. Dump the 43rd field which shows the gtime in every second: # while :; do awk '{print $3" "$43}' /proc/3955/task/4014/stat; sleep 1; done S 4348 R 7064566 R 7064766 R 7064967 R 7065168 S 4759 S 4759 During running busy loop, it returns large value. After applying this patch, we can see right gtime. # while :; do awk '{print $3" "$43}' /proc/10913/task/10956/stat; sleep 1; done S 5338 R 5365 R 5465 R 5566 R 5666 S 5726 S 5726 Signed-off-by: Hiroshi Shimamoto Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447948054-28668-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 26a5446..05de80b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -788,6 +788,9 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; + if (!context_tracking_is_enabled()) + return t->gtime; + do { seq = read_seqbegin(&t->vtime_seqlock); -- cgit v0.10.2 From b75a22531588e77aa8c2daf228c9723916ae2cd0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Oct 2015 14:36:17 +0200 Subject: sched/core: Better document the try_to_wake_up() barriers Explain how the control dependency and smp_rmb() end up providing ACQUIRE semantics and pair with smp_store_release() in finish_lock_switch(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eee4ee6..b64f163 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1953,7 +1953,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) while (p->on_cpu) cpu_relax(); /* - * Pairs with the smp_wmb() in finish_lock_switch(). + * Combined with the control dependency above, we have an effective + * smp_load_acquire() without the need for full barriers. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. */ smp_rmb(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efd3bfc..b242775 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1073,6 +1073,9 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * We must ensure this doesn't happen until the switch is completely * finished. * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * * Pairs with the control dependency and rmb in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); -- cgit v0.10.2 From ecf7d01c229d11a44609c0067889372c91fb4f36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Oct 2015 14:14:13 +0200 Subject: sched/core: Fix an SMP ordering race in try_to_wake_up() vs. schedule() Oleg noticed that its possible to falsely observe p->on_cpu == 0 such that we'll prematurely continue with the wakeup and effectively run p on two CPUs at the same time. Even though the overlap is very limited; the task is in the middle of being scheduled out; it could still result in corruption of the scheduler data structures. CPU0 CPU1 set_current_state(...) context_switch(X, Y) prepare_lock_switch(Y) Y->on_cpu = 1; finish_lock_switch(X) store_release(X->on_cpu, 0); try_to_wake_up(X) LOCK(p->pi_lock); t = X->on_cpu; // 0 context_switch(Y, X) prepare_lock_switch(X) X->on_cpu = 1; finish_lock_switch(Y) store_release(Y->on_cpu, 0); schedule(); deactivate_task(X); X->on_rq = 0; if (X->on_rq) // false if (t) while (X->on_cpu) cpu_relax(); context_switch(X, ..) finish_lock_switch(X) store_release(X->on_cpu, 0); Avoid the load of X->on_cpu being hoisted over the X->on_rq load. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b64f163..7063c6a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1947,6 +1947,25 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #ifdef CONFIG_SMP /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ -- cgit v0.10.2