From b93e0b8fa819c3d5641794ed9a07e643416aa0fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 23 May 2014 18:10:21 +0200 Subject: irq_work: Split raised and lazy lists An irq work can be handled from two places: from the tick if the work carries the "lazy" flag and the tick is periodic, or from a self IPI. We merge all these works in a single list and we use some per cpu latch to avoid raising a self-IPI when one is already pending. Now we could do away with this ugly latch if only the list was only made of non-lazy works. Just enqueueing a work on the empty list would be enough to know if we need to raise an IPI or not. Also we are going to implement remote irq work queuing. Then the per CPU latch will need to become atomic in the global scope. That's too bad because, here as well, just enqueueing a work on an empty list of non-lazy works would be enough to know if we need to raise an IPI or not. So lets take a way out of this: split the works in two distinct lists, one for the works that can be handled by the next tick and another one for those handled by the IPI. Just checking if the latter is empty when we queue a new work is enough to know if we need to raise an IPI. Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra Cc: Andrew Morton Cc: Ingo Molnar Cc: Kevin Hilman Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/kernel/irq_work.c b/kernel/irq_work.c index a82170e..126f254 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -19,8 +19,8 @@ #include -static DEFINE_PER_CPU(struct llist_head, irq_work_list); -static DEFINE_PER_CPU(int, irq_work_raised); +static DEFINE_PER_CPU(struct llist_head, raised_list); +static DEFINE_PER_CPU(struct llist_head, lazy_list); /* * Claim the entry so that no one else will poke at it. @@ -70,15 +70,13 @@ bool irq_work_queue(struct irq_work *work) /* Queue the entry and raise the IPI if needed. */ preempt_disable(); - llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - - /* - * If the work is not "lazy" or the tick is stopped, raise the irq - * work interrupt (if supported by the arch), otherwise, just wait - * for the next tick. - */ - if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { - if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) + /* If the work is "lazy", handle it from next tick if any */ + if (work->flags & IRQ_WORK_LAZY) { + if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && + tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) arch_irq_work_raise(); } @@ -90,10 +88,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue); bool irq_work_needs_cpu(void) { - struct llist_head *this_list; + struct llist_head *raised, *lazy; - this_list = &__get_cpu_var(irq_work_list); - if (llist_empty(this_list)) + raised = &__get_cpu_var(raised_list); + lazy = &__get_cpu_var(lazy_list); + if (llist_empty(raised) && llist_empty(lazy)) return false; /* All work should have been flushed before going offline */ @@ -102,28 +101,18 @@ bool irq_work_needs_cpu(void) return true; } -static void __irq_work_run(void) +static void irq_work_run_list(struct llist_head *list) { unsigned long flags; struct irq_work *work; - struct llist_head *this_list; struct llist_node *llnode; + BUG_ON(!irqs_disabled()); - /* - * Reset the "raised" state right before we check the list because - * an NMI may enqueue after we find the list empty from the runner. - */ - __this_cpu_write(irq_work_raised, 0); - barrier(); - - this_list = &__get_cpu_var(irq_work_list); - if (llist_empty(this_list)) + if (llist_empty(list)) return; - BUG_ON(!irqs_disabled()); - - llnode = llist_del_all(this_list); + llnode = llist_del_all(list); while (llnode != NULL) { work = llist_entry(llnode, struct irq_work, llnode); @@ -148,6 +137,12 @@ static void __irq_work_run(void) } } +static void __irq_work_run(void) +{ + irq_work_run_list(&__get_cpu_var(raised_list)); + irq_work_run_list(&__get_cpu_var(lazy_list)); +} + /* * Run the irq_work entries on this cpu. Requires to be ran from hardirq * context with local IRQs disabled. -- cgit v0.10.2 From 478850160636c4f0b2558451df0e42f8c5a10939 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 8 May 2014 01:37:48 +0200 Subject: irq_work: Implement remote queueing irq work currently only supports local callbacks. However its code is mostly ready to run remote callbacks and we have some potential user. The full nohz subsystem currently open codes its own remote irq work on top of the scheduler ipi when it wants a CPU to reevaluate its next tick. However this ad hoc solution bloats the scheduler IPI. Lets just extend the irq work subsystem to support remote queuing on top of the generic SMP IPI to handle this kind of user. This shouldn't add noticeable overhead. Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra Cc: Andrew Morton Cc: Eric Dumazet Cc: Ingo Molnar Cc: Kevin Hilman Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 19ae05d..bf9422c 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -33,6 +33,11 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } bool irq_work_queue(struct irq_work *work); + +#ifdef CONFIG_SMP +bool irq_work_queue_on(struct irq_work *work, int cpu); +#endif + void irq_work_run(void); void irq_work_sync(struct irq_work *work); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 126f254..4b0a890 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void) */ } +#ifdef CONFIG_SMP /* - * Enqueue the irq_work @entry unless it's already pending + * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ +bool irq_work_queue_on(struct irq_work *work, int cpu) +{ + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(cpu)); + + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + + /* Only queue if not already pending */ + if (!irq_work_claim(work)) + return false; + + if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) + arch_send_call_function_single_ipi(cpu); + + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue_on); +#endif + +/* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ diff --git a/kernel/smp.c b/kernel/smp.c index 306f818..a1812d1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,7 @@ * * (C) Jens Axboe 2008 */ +#include #include #include #include @@ -210,6 +211,14 @@ void generic_smp_call_function_single_interrupt(void) csd->func(csd->info); csd_unlock(csd); } + + /* + * Handle irq works queued remotely by irq_work_queue_on(). + * Smp functions above are typically synchronous so they + * better run first since some other CPUs may be busy waiting + * for them. + */ + irq_work_run(); } /* -- cgit v0.10.2 From 3d36aebc2e78923095575df954f3f3b430ac0a30 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Jun 2014 16:17:33 +0200 Subject: nohz: Support nohz full remote kick Remotely kicking a full nohz CPU in order to make it re-evaluate its next tick is currently implemented using the scheduler IPI. However this bloats a scheduler fast path with an off-topic feature. The scheduler tick was abused here for its cool "callable anywhere/anytime" properties. But now that the irq work subsystem can queue remote callbacks, it's a perfect fit to safely queue IPIs when interrupts are disabled without worrying about concurrent callers. So lets implement remote kick on top of irq work. This is going to be used when a new event requires the next tick to be recalculated: more than 1 task competing on the CPU, timer armed, ... Acked-by: Peter Zijlstra Cc: Andrew Morton Cc: Ingo Molnar Cc: Kevin Hilman Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/tick.h b/include/linux/tick.h index b84773c..8a4987f 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -181,7 +181,13 @@ static inline bool tick_nohz_full_cpu(int cpu) extern void tick_nohz_init(void); extern void __tick_nohz_full_check(void); -extern void tick_nohz_full_kick(void); +extern void tick_nohz_full_kick_cpu(int cpu); + +static inline void tick_nohz_full_kick(void) +{ + tick_nohz_full_kick_cpu(smp_processor_id()); +} + extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(struct task_struct *tsk); #else @@ -189,6 +195,7 @@ static inline void tick_nohz_init(void) { } static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void __tick_nohz_full_check(void) { } +static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(struct task_struct *tsk) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6558b7a..3d63944 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -224,13 +224,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { }; /* - * Kick the current CPU if it's full dynticks in order to force it to + * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ -void tick_nohz_full_kick(void) +void tick_nohz_full_kick_cpu(int cpu) { - if (tick_nohz_full_cpu(smp_processor_id())) - irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); + if (!tick_nohz_full_cpu(cpu)) + return; + + irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void nohz_full_kick_ipi(void *info) -- cgit v0.10.2 From 53c5fa16b4c843f1df91f7498e3c7bf95e0eaefa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Jun 2014 16:20:21 +0200 Subject: nohz: Switch to nohz full remote kick on timer enqueue When a new timer is enqueued on a full dynticks target, that CPU must re-evaluate the next tick to handle the timer correctly. This is currently performed through the scheduler IPI. Meanwhile this happens at the cost of off-topic workarounds in that fast path to make it call irq_exit(). As we plan to remove this hack off the scheduler IPI, lets use the nohz full kick instead. Pretty much any IPI fits for that job as long at it calls irq_exit(). The nohz full kick just happens to be handy and readily available here. If it happens to be too much an overkill in the future, we can still turn that timer kick into an empty IPI. Acked-by: Peter Zijlstra Cc: Andrew Morton Cc: Ingo Molnar Cc: Kevin Hilman Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf01b..feb5496 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -684,10 +684,16 @@ static void wake_up_idle_cpu(int cpu) static bool wake_up_full_nohz_cpu(int cpu) { + /* + * We just need the target to call irq_exit() and re-evaluate + * the next tick. The nohz full kick at least implies that. + * If needed we can still optimize that later with an + * empty IRQ. + */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) - smp_send_reschedule(cpu); + tick_nohz_full_kick_cpu(cpu); return true; } -- cgit v0.10.2 From fd2ac4f4a65a7f34b0bc6433fcca1192d7ba8b8e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Mar 2014 21:12:53 +0100 Subject: nohz: Use nohz own full kick on 2nd task enqueue Now that we have a nohz full remote kick based on irq work, lets use it to notify a CPU that it's exiting single task mode. This unbloats a bit the scheduler IPI that the nohz code was abusing for its cool "callable anywhere/anytime" properties. Acked-by: Peter Zijlstra Cc: Andrew Morton Cc: Ingo Molnar Cc: Kevin Hilman Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/kernel/sched/core.c b/kernel/sched/core.c index feb5496..13f5857 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1574,9 +1574,7 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); - if (llist_empty(&this_rq()->wake_list) - && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) return; /* @@ -1593,7 +1591,6 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); - tick_nohz_full_check(); sched_ttwu_pending(); /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31cc02e..599a72af 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1223,7 +1223,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (tick_nohz_full_cpu(rq->cpu)) { /* Order rq->nr_running write against the IPI */ smp_wmb(); - smp_send_reschedule(rq->cpu); + tick_nohz_full_kick_cpu(rq->cpu); } } #endif -- cgit v0.10.2 From 3882ec643997757824cd5f25180cd8a787b9dbe1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Mar 2014 22:54:04 +0100 Subject: nohz: Use IPI implicit full barrier against rq->nr_running r/w A full dynticks CPU is allowed to stop its tick when a single task runs. Meanwhile when a new task gets enqueued, the CPU must be notified so that it can restart its tick to maintain local fairness and other accounting details. This notification is performed by way of an IPI. Then when the target receives the IPI, we expect it to see the new value of rq->nr_running. Hence the following ordering scenario: CPU 0 CPU 1 write rq->running get IPI smp_wmb() smp_rmb() send IPI read rq->nr_running But Paul Mckenney says that nowadays IPIs imply a full barrier on all architectures. So we can safely remove this pair and rely on the implicit barriers that come along IPI send/receive. Lets just comment on this new assumption. Acked-by: Peter Zijlstra Cc: Andrew Morton Cc: Ingo Molnar Cc: Kevin Hilman Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13f5857..7f3063c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -740,10 +740,11 @@ bool sched_can_stop_tick(void) rq = this_rq(); - /* Make sure rq->nr_running update is visible after the IPI */ - smp_rmb(); - - /* More than one running task need preemption */ + /* + * More than one running task need preemption. + * nr_running update is assumed to be visible + * after IPI is sent from wakers. + */ if (rq->nr_running > 1) return false; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 599a72af..eb85676 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1221,8 +1221,14 @@ static inline void add_nr_running(struct rq *rq, unsigned count) #ifdef CONFIG_NO_HZ_FULL if (prev_nr < 2 && rq->nr_running >= 2) { if (tick_nohz_full_cpu(rq->cpu)) { - /* Order rq->nr_running write against the IPI */ - smp_wmb(); + /* + * Tick is needed if more than one task runs on a CPU. + * Send the target an IPI to kick it out of nohz mode. + * + * We assume that IPI implies full memory barrier and the + * new value of rq->nr_running is visible on reception + * from the target. + */ tick_nohz_full_kick_cpu(rq->cpu); } } -- cgit v0.10.2 From a43455a1d572daf7b730fe12eb747d1e17411365 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 4 Jun 2014 16:09:42 -0400 Subject: sched/numa: Ensure task_numa_migrate() checks the preferred node The first thing task_numa_migrate() does is check to see if there is CPU capacity available on the preferred node, in order to move the task there. However, if the preferred node is all busy, we would skip considering that node for tasks swaps in the subsequent loop. This prevents NUMA convergence of tasks on busy systems. However, swapping locations with a task on our preferred nid, when the preferred nid is busy, is perfectly fine. The fix is to also look for a CPU on our preferred nid when it is totally busy. This changes "perf bench numa mem -p 4 -t 20 -m -0 -P 1000" from not converging in 15 minutes on my 4 node system, to converging in 10-20 seconds. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140604160942.6969b101@cuia.bos.redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d33..8fbb011 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1302,9 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has free capacity, try to use it. */ - if (env.dst_stats.has_free_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); + /* Try to find a spot on the preferred nid. */ + task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ if (env.best_cpu == -1) { -- cgit v0.10.2 From bb97fc31647539f1f102eed646a95e200160a150 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 4 Jun 2014 16:33:15 -0400 Subject: sched/numa: Always try to migrate to preferred node at task_numa_placement() time It is possible that at task_numa_placement() time, the task's numa_preferred_nid does not change, but the task is not actually running on the preferred node at the time. In that case, we still want to attempt migration to the preferred node. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140604163315.1dbc7b56@cuia.bos.redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fbb011..3fa3e18 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1613,11 +1613,13 @@ static void task_numa_placement(struct task_struct *p) spin_unlock_irq(group_lock); } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + + if (task_node(p) != p->numa_preferred_nid) + numa_migrate_preferred(p); } } -- cgit v0.10.2 From 5d5e2b1bcbdc996e72815c03fdc5ea82c4642397 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jun 2014 10:58:43 +0200 Subject: sched: Fix CACHE_HOT_BUDY condition When computing cache hot, we should check if the migration dst cpu is idle, instead of the current cpu. Though they are same in normal balancing, that is false nowadays in nohz idle balancing at least. Signed-off-by: Hillf Danton Signed-off-by: Peter Zijlstra Cc: Hillf Danton Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140607090452.4696E301D2@webmail.sinamail.sina.com.cn Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3fa3e18..1f9c457 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5095,8 +5095,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) /* * Is this task likely cache-hot: */ -static int -task_hot(struct task_struct *p, u64 now) +static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; @@ -5109,7 +5108,7 @@ task_hot(struct task_struct *p, u64 now) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -5119,7 +5118,7 @@ task_hot(struct task_struct *p, u64 now) if (sysctl_sched_migration_cost == 0) return 0; - delta = now - p->se.exec_start; + delta = rq_clock_task(env->src_rq) - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; } @@ -5273,7 +5272,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + tsk_cache_hot = task_hot(p, env); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); -- cgit v0.10.2 From a77353e5eb56b6c6098bfce59aff1f449451b0b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Jun 2014 07:13:07 +0200 Subject: irq_work: Remove BUG_ON in irq_work_run() Because of a collision with 8d056c48e486 ("CPU hotplug, smp: flush any pending IPI callbacks before CPU offline"), which ends up calling hotplug_cfd()->flush_smp_call_function_queue()->irq_work_run(), which is not from IRQ context. And since that already calls irq_work_run() from the hotplug path, remove our entire hotplug handling. Reported-by: Stephen Warren Tested-by: Stephen Warren Reviewed-by: Srivatsa S. Bhat Cc: Frederic Weisbecker Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-busatzs2gvz4v62258agipuf@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 4b0a890..e6bcbe7 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -160,20 +160,14 @@ static void irq_work_run_list(struct llist_head *list) } } -static void __irq_work_run(void) -{ - irq_work_run_list(&__get_cpu_var(raised_list)); - irq_work_run_list(&__get_cpu_var(lazy_list)); -} - /* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. + * hotplug calls this through: + * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { - BUG_ON(!in_irq()); - __irq_work_run(); + irq_work_run_list(&__get_cpu_var(raised_list)); + irq_work_run_list(&__get_cpu_var(lazy_list)); } EXPORT_SYMBOL_GPL(irq_work_run); @@ -189,35 +183,3 @@ void irq_work_sync(struct irq_work *work) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); - -#ifdef CONFIG_HOTPLUG_CPU -static int irq_work_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_DYING: - /* Called from stop_machine */ - if (WARN_ON_ONCE(cpu != smp_processor_id())) - break; - __irq_work_run(); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_notify; - -static __init int irq_work_init_cpu_notifier(void) -{ - cpu_notify.notifier_call = irq_work_cpu_notify; - cpu_notify.priority = 0; - register_cpu_notifier(&cpu_notify); - return 0; -} -device_initcall(irq_work_init_cpu_notifier); - -#endif /* CONFIG_HOTPLUG_CPU */ -- cgit v0.10.2 From 541b82644d72c1ef4a0587515a619712c1c19bd3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 24 Jun 2014 14:04:12 +0530 Subject: sched/core: Fix formatting issues in sched_can_stop_tick() sched_can_stop_tick() is using 7 spaces instead of 8 spaces or a 'tab' at the beginning of few lines. Which doesn't align well with the Coding Guidelines. Also remove local variable 'rq' as it is used at only one place and we can directly use this_rq() instead. Signed-off-by: Viresh Kumar Cc: fweisbec@gmail.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/afb781733e4a9ffbced5eb9fd25cc0aa5c6ffd7a.1403596966.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f3063c..866d840 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -736,19 +736,15 @@ static inline bool got_nohz_idle_kick(void) #ifdef CONFIG_NO_HZ_FULL bool sched_can_stop_tick(void) { - struct rq *rq; - - rq = this_rq(); - /* * More than one running task need preemption. * nr_running update is assumed to be visible * after IPI is sent from wakers. */ - if (rq->nr_running > 1) - return false; + if (this_rq()->nr_running > 1) + return false; - return true; + return true; } #endif /* CONFIG_NO_HZ_FULL */ -- cgit v0.10.2 From c06f04c70489b9deea3212af8375e2f0c2f0b184 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Fri, 20 Jun 2014 15:21:20 -0700 Subject: sched: Fix potential near-infinite distribute_cfs_runtime() loop distribute_cfs_runtime() intentionally only hands out enough runtime to bring each cfs_rq to 1 ns of runtime, expecting the cfs_rqs to then take the runtime they need only once they actually get to run. However, if they get to run sufficiently quickly, the period timer is still in distribute_cfs_runtime() and no runtime is available, causing them to throttle. Then distribute has to handle them again, and this can go on until distribute has handed out all of the runtime 1ns at a time, which takes far too long. Instead allow access to the same runtime that distribute is handing out, accepting that corner cases with very low quota may be able to spend the entire cfs_b->runtime during distribute_cfs_runtime, meaning that the runtime directly handed out by distribute_cfs_runtime was over quota. In addition, if a cfs_rq does manage to throttle like this, make sure the existing distribute_cfs_runtime no longer loops over it again. Signed-off-by: Ben Segall Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140620222120.13814.21652.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f9c457..ef5eac7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3361,7 +3361,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + /* + * Add to the _head_ of the list, so that an already-started + * distribute_cfs_runtime will not see us + */ + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); @@ -3418,7 +3422,8 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining, u64 expires) { struct cfs_rq *cfs_rq; - u64 runtime = remaining; + u64 runtime; + u64 starting_runtime = remaining; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3449,7 +3454,7 @@ next: } rcu_read_unlock(); - return remaining; + return starting_runtime - remaining; } /* @@ -3495,22 +3500,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - /* - * There are throttled entities so we must first use the new bandwidth - * to unthrottle them before making it generally available. This - * ensures that all existing debts will be paid before a new cfs_rq is - * allowed to run. - */ - runtime = cfs_b->runtime; runtime_expires = cfs_b->runtime_expires; - cfs_b->runtime = 0; /* - * This check is repeated as we are holding onto the new bandwidth - * while we unthrottle. This can potentially race with an unthrottled - * group trying to acquire new bandwidth from the global pool. + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. This can potentially race with an unthrottled group + * trying to acquire new bandwidth from the global pool. This can result + * in us over-using our runtime if it is all used during this loop, but + * only by limited amounts in that extreme case. */ - while (throttled && runtime > 0) { + while (throttled && cfs_b->runtime > 0) { + runtime = cfs_b->runtime; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, @@ -3518,10 +3518,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) raw_spin_lock(&cfs_b->lock); throttled = !list_empty(&cfs_b->throttled_cfs_rq); + + cfs_b->runtime -= min(runtime, cfs_b->runtime); } - /* return (any) remaining runtime */ - cfs_b->runtime = runtime; /* * While we are ensured activity in the period following an * unthrottle, this also covers the case in which the new bandwidth is @@ -3632,10 +3632,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; } - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - cfs_b->runtime = 0; - } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -3646,7 +3645,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime = runtime; + cfs_b->runtime -= min(runtime, cfs_b->runtime); raw_spin_unlock(&cfs_b->lock); } -- cgit v0.10.2 From 4036ac1567834222fc763ab18e3e17df93b4eaaf Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 24 Jun 2014 07:49:40 +0200 Subject: sched: Fix clock_gettime(CLOCK_[PROCESS/THREAD]_CPUTIME_ID) monotonicity If a task has been dequeued, it has been accounted. Do not project cycles that may or may not ever be accounted to a dequeued task, as that may make clock_gettime() both inaccurate and non-monotonic. Protect update_rq_clock() from slight TSC skew while at it. Signed-off-by: Mike Galbraith Cc: kosaki.motohiro@jp.fujitsu.com Cc: pjt@google.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403588980.29711.11.camel@marge.simpson.net Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 866d840..e50234b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + if (delta < 0) + return; rq->clock += delta; update_rq_clock_task(rq, delta); } @@ -2431,7 +2433,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { u64 ns = 0; - if (task_current(rq, p)) { + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (task_current(rq, p) && p->on_rq) { update_rq_clock(rq); ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) @@ -2474,8 +2481,10 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we race with it leaving cpu, we'll take a lock. So we're correct. * If we race with it entering cpu, unaccounted time is 0. This is * indistinguishable from the read occurring a few cycles earlier. + * If we see ->on_cpu without ->on_rq, the task is leaving, and has + * been accounted, so we're correct here as well. */ - if (!p->on_cpu) + if (!p->on_cpu || !p->on_rq) return p->se.sum_exec_runtime; #endif -- cgit v0.10.2 From 89abb5ad10ae8ac3405e635ac80815f781c8b8e9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 24 Jun 2014 10:01:01 +0530 Subject: sched/idle: Drop !! while calculating 'broadcast' We don't need 'broadcast' to be set to 'zero or one', but to 'zero or non-zero' and so the extra operation to convert it to 'zero or one' can be skipped. Also change type of 'broadcast' to unsigned int, i.e. type of drv->states[*].flags. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/0dfbe2976aa108c53e08d3477ea90f6360c1f54c.1403584026.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb..9f1608f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -79,7 +79,7 @@ static void cpuidle_idle_call(void) struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; - bool broadcast; + unsigned int broadcast; /* * Check if the idle task must be rescheduled. If it is the @@ -135,7 +135,7 @@ use_default: goto exit_idle; } - broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; /* * Tell the time framework to switch to a broadcast timer -- cgit v0.10.2 From 4486edd12b5ac8a9af7a5e16e4b9eeb3b8339c10 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Mon, 23 Jun 2014 12:16:49 -0700 Subject: sched/fair: Implement fast idling of CPUs when the system is partially loaded When a system is lightly loaded (i.e. no more than 1 job per cpu), attempt to pull job to a cpu before putting it to idle is unnecessary and can be skipped. This patch adds an indicator so the scheduler can know when there's no more than 1 active job is on any CPU in the system to skip needless job pulls. On a 4 socket machine with a request/response kind of workload from clients, we saw about 0.13 msec delay when we go through a full load balance to try pull job from all the other cpus. While 0.1 msec was spent on processing the request and generating a response, the 0.13 msec load balance overhead was actually more than the actual work being done. This overhead can be skipped much of the time for lightly loaded systems. With this patch, we tested with a netperf request/response workload that has the server busy with half the cpus in a 4 socket system. We found the patch eliminated 75% of the load balance attempts before idling a cpu. The overhead of setting/clearing the indicator is low as we already gather the necessary info while we call add_nr_running() and update_sd_lb_stats.() We switch to full load balance load immediately if any cpu got more than one job on its run queue in add_nr_running. We'll clear the indicator to avoid load balance when we detect no cpu's have more than one job when we scan the work queues in update_sg_lb_stats(). We are aggressive in turning on the load balance and opportunistic in skipping the load balance. Signed-off-by: Tim Chen Acked-by: Rik van Riel Acked-by: Jason Low Cc: "Paul E.McKenney" Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Alex Shi Cc: Michel Lespinasse Cc: Peter Hurley Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403551009.2970.613.camel@schen9-DESK Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef5eac7..e3ff3d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5866,7 +5866,8 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs, + bool *overload) { unsigned long load; int i; @@ -5884,6 +5885,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; + + if (rq->nr_running > 1) + *overload = true; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5994,6 +5999,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; + bool overload = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6014,7 +6020,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs, + &overload); if (local_group) goto next_group; @@ -6048,6 +6055,13 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + + if (!env->sd->parent) { + /* update overload indicator if we are at root domain */ + if (env->dst_rq->rd->overload != overload) + env->dst_rq->rd->overload = overload; + } + } /** @@ -6766,7 +6780,8 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) { + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eb85676..0191ed5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -477,6 +477,9 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; + /* Indicate more than one runnable task for any CPU */ + bool overload; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). @@ -1218,8 +1221,13 @@ static inline void add_nr_running(struct rq *rq, unsigned count) rq->nr_running = prev_nr + count; -#ifdef CONFIG_NO_HZ_FULL if (prev_nr < 2 && rq->nr_running >= 2) { +#ifdef CONFIG_SMP + if (!rq->rd->overload) + rq->rd->overload = true; +#endif + +#ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(rq->cpu)) { /* * Tick is needed if more than one task runs on a CPU. @@ -1231,8 +1239,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) */ tick_nohz_full_kick_cpu(rq->cpu); } - } #endif + } } static inline void sub_nr_running(struct rq *rq, unsigned count) -- cgit v0.10.2 From f0b8a4afd6a8c500161e45065a91738b490bf5ae Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:41:29 -0400 Subject: sched/numa: Use group's max nid as task's preferred nid From task_numa_placement, always try to consolidate the tasks in a group on the group's top nid. In case this task is part of a group that is interleaved over multiple nodes, task_numa_migrate will set the task's preferred nid to the best node it could find for the task, so this patch will cause at most one run through task_numa_migrate. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e3ff3d1..96b2d39 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1594,23 +1594,8 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. - */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - spin_unlock_irq(group_lock); + max_nid = max_group_nid; } if (max_faults) { -- cgit v0.10.2 From 28a21745190a0ca613cab817bfe3dc65373158bf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:13 -0400 Subject: sched/numa: Move power adjustment into load_too_imbalanced() Currently the NUMA code scales the load on each node with the amount of CPU power available on that node, but it does not apply any adjustment to the load of the task that is being moved over. On systems with SMT/HT, this results in a task being weighed much more heavily than a CPU core, and a task move that would even out the load between nodes being disallowed. The correct thing is to apply the power correction to the numbers after we have first applied the move of the tasks' loads to them. This also allows us to do the power correction with a multiplication, rather than a division. Also drop two function arguments for load_too_unbalanced, since it takes various factors from env already. Signed-off-by: Rik van Riel Cc: chegu_vinod@hp.com Cc: mgorman@suse.de Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96b2d39..f287d0b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); @@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, - long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { long imb, old_imb; + long orig_src_load, orig_dst_load; + long src_capacity, dst_capacity; + + /* + * The load is corrected for the CPU capacity available on each node. + * + * src_load dst_load + * ------------ vs --------- + * src_capacity dst_capacity + */ + src_capacity = env->src_stats.compute_capacity; + dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ if (dst_load < src_load) swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = dst_load * 100 - src_load * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; @@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, * The imbalance is above the allowed threshold. * Compare it with the old imbalance. */ + orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + if (orig_dst_load < orig_src_load) swap(orig_dst_load, orig_src_load); - old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; /* Would this change make things worse? */ return (imb > old_imb); @@ -1136,8 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long orig_src_load, src_load; - long orig_dst_load, dst_load; + long src_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1211,13 +1225,9 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - orig_dst_load = env->dst_stats.load; - orig_src_load = env->src_stats.load; - - /* XXX missing capacity terms */ load = task_h_load(env->p); - dst_load = orig_dst_load + load; - src_load = orig_src_load - load; + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; if (cur) { load = task_h_load(cur); @@ -1225,8 +1235,7 @@ balance: src_load += load; } - if (load_too_imbalanced(orig_src_load, orig_dst_load, - src_load, dst_load, env)) + if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; assign: -- cgit v0.10.2 From 6dc1a672ab15604947361dcd02e459effa09bad5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:14 -0400 Subject: sched/numa: Use effective_load() to balance NUMA loads When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places on a CPU is determined by the group the task is in. The active groups on the source and destination CPU can be different, resulting in a different load contribution by the same task at its source and at its destination. As a result, the load needs to be calculated separately for each CPU, instead of estimated once with task_h_load(). Getting this calculation right allows some workloads to converge, where previously the last thread could get stuck on another node, without being able to migrate to its final destination. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f287d0b..d6526d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1151,6 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; + struct task_group *tg; long src_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1225,14 +1226,21 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; + src_load = env->src_stats.load; + dst_load = env->dst_stats.load; + + /* Calculate the effect of moving env->p from src to dst. */ + load = env->p->se.load.weight; + tg = task_group(env->p); + src_load += effective_load(tg, env->src_cpu, -load, -load); + dst_load += effective_load(tg, env->dst_cpu, load, load); if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; + /* Cur moves in the opposite direction. */ + load = cur->se.load.weight; + tg = task_group(cur); + src_load += effective_load(tg, env->src_cpu, load, load); + dst_load += effective_load(tg, env->dst_cpu, -load, -load); } if (load_too_imbalanced(src_load, dst_load, env)) -- cgit v0.10.2 From 1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:15 -0400 Subject: sched/numa: Simplify task_numa_compare() When a task is part of a numa_group, the comparison should always use the group weight, in order to make workloads converge. Signed-off-by: Rik van Riel Cc: chegu_vinod@hp.com Cc: mgorman@suse.de Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d6526d2..cebb312 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1154,7 +1154,7 @@ static void task_numa_compare(struct task_numa_env *env, struct task_group *tg; long src_load, dst_load; long load; - long imp = (groupimp > 0) ? groupimp : taskimp; + long imp = env->p->numa_group ? groupimp : taskimp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1192,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env, * itself (not part of a group), use the task weight * instead. */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - if (cur->numa_group) imp += group_weight(cur, env->src_nid) - group_weight(cur, env->dst_nid); -- cgit v0.10.2 From 0132c3e1777ceabc24c7d209b7cbe78c28c03c09 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:16 -0400 Subject: sched/numa: Examine a task move when examining a task swap Running "perf bench numa mem -0 -m -P 1000 -p 8 -t 20" on a 4 node system results in 160 runnable threads on a system with 80 CPU threads. Once a process has nearly converged, with 39 threads on one node and 1 thread on another node, the remaining thread will be unable to migrate to its preferred node through a task swap. However, a simple task move would make the workload converge, witout causing an imbalance. Test for this unlikely occurrence, and attempt a task move to the preferred nid when it happens. # Running main, "perf bench numa mem -p 8 -t 20 -0 -m -P 1000" ### # 160 tasks will execute (on 4 nodes, 80 CPUs): # -1x 0MB global shared mem operations # -1x 1000MB process shared mem operations # -1x 0MB thread local mem operations ### ### # # 0.0% [0.2 mins] 0/0 1/1 36/2 0/0 [36/3 ] l: 0-0 ( 0) {0-2} # 0.0% [0.3 mins] 43/3 37/2 39/2 41/3 [ 6/10] l: 0-1 ( 1) {1-2} # 0.0% [0.4 mins] 42/3 38/2 40/2 40/2 [ 4/9 ] l: 1-2 ( 1) [50.0%] {1-2} # 0.0% [0.6 mins] 41/3 39/2 40/2 40/2 [ 2/9 ] l: 2-4 ( 2) [50.0%] {1-2} # 0.0% [0.7 mins] 40/2 40/2 40/2 40/2 [ 0/8 ] l: 3-5 ( 2) [40.0%] ( 41.8s converged) Without this patch, this same perf bench numa mem run had to rely on the scheduler load balancer to first balance out the load (moving a random task), before a task swap could complete the NUMA convergence. The load balancer does not normally take action unless the load difference exceeds 25%. Convergence times of over half an hour have been observed without this patch. With this patch, the NUMA balancing code will simply migrate the task, if that does not cause an imbalance. Also skip examining a CPU in detail if the improvement on that CPU is no more than the best we already have. Signed-off-by: Rik van Riel Cc: chegu_vinod@hp.com Cc: mgorman@suse.de Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ggthh0rnh0yua6o5o3p6cr1o@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cebb312..9d1734a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1155,6 +1155,7 @@ static void task_numa_compare(struct task_numa_env *env, long src_load, dst_load; long load; long imp = env->p->numa_group ? groupimp : taskimp; + long moveimp = imp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1201,7 +1202,7 @@ static void task_numa_compare(struct task_numa_env *env, } } - if (imp < env->best_imp) + if (imp <= env->best_imp && moveimp <= env->best_imp) goto unlock; if (!cur) { @@ -1214,7 +1215,8 @@ static void task_numa_compare(struct task_numa_env *env, } /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + if (imp > env->best_imp && src_rq->nr_running == 1 && + dst_rq->nr_running == 1) goto assign; /* @@ -1230,6 +1232,23 @@ balance: src_load += effective_load(tg, env->src_cpu, -load, -load); dst_load += effective_load(tg, env->dst_cpu, load, load); + if (moveimp > imp && moveimp > env->best_imp) { + /* + * If the improvement from just moving env->p direction is + * better than swapping tasks around, check if a move is + * possible. Store a slightly smaller score than moveimp, + * so an actually idle CPU will win. + */ + if (!load_too_imbalanced(src_load, dst_load, env)) { + imp = moveimp - 1; + cur = NULL; + goto assign; + } + } + + if (imp <= env->best_imp) + goto unlock; + if (cur) { /* Cur moves in the opposite direction. */ load = cur->se.load.weight; -- cgit v0.10.2 From db015daedb56251b73f956f70b3b8813f80d8ee1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:41:34 -0400 Subject: sched/numa: Rework best node setting in task_numa_migrate() Fix up the best node setting in task_numa_migrate() to deal with a task in a pseudo-interleaved NUMA group, which is already running in the best location. Set the task's preferred nid to the current nid, so task migration is not retried at a high rate. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-7-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d1734a..7bb2f46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1354,10 +1354,6 @@ static int task_numa_migrate(struct task_struct *p) } } - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - /* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember @@ -1366,8 +1362,19 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) - sched_setnuma(p, env.dst_nid); + if (p->numa_group) { + if (env.best_cpu == -1) + nid = env.src_nid; + else + nid = env.dst_nid; + + if (node_isset(nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; /* * Reset the scan period if the task is being rescheduled on an -- cgit v0.10.2 From a22b4b012340b988dbe7a58461d6fcc582f34aa0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:41:35 -0400 Subject: sched/numa: Change scan period code to match intent Reading through the scan period code and comment, it appears the intent was to slow down NUMA scanning when a majority of accesses are on the local node, specifically a local:remote ratio of 3:1. However, the code actually tests local / (local + remote), and the actual cut-off point was around 30% local accesses, well before a task has actually converged on a node. Changing the threshold to 7 means scanning slows down when a task has around 70% of its accesses local, which appears to match the intent of the code more closely. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-8-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7bb2f46..a140c6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1452,12 +1452,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses. */ #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7 /* * Increase the scan period (slow down scanning) if the majority of -- cgit v0.10.2 From 0e59bdaea75f12a7d7c03672f4ac22c0119a1bc0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 25 Jun 2014 12:19:42 +0400 Subject: sched/fair: Disable runtime_enabled on dying rq We kill rq->rd on the CPU_DOWN_PREPARE stage: cpuset_cpu_inactive -> cpuset_update_active_cpus -> partition_sched_domains -> -> cpu_attach_domain -> rq_attach_root -> set_rq_offline This unthrottles all throttled cfs_rqs. But the cpu is still able to call schedule() till take_cpu_down->__cpu_disable() is called from stop_machine. This case the tasks from just unthrottled cfs_rqs are pickable in a standard scheduler way, and they are picked by dying cpu. The cfs_rqs becomes throttled again, and migrate_tasks() in migration_call skips their tasks (one more unthrottle in migrate_tasks()->CPU_DYING does not happen, because rq->rd is already NULL). Patch sets runtime_enabled to zero. This guarantees, the runtime is not accounted, and the cfs_rqs won't exceed given cfs_rq->runtime_remaining = 1, and tasks will be pickable in migrate_tasks(). runtime_enabled is recalculated again when rq becomes online again. Ben Segall also noticed, we always enable runtime in tg_set_cfs_bandwidth(). Actually, we should do that for online cpus only. To prevent races with unthrottle_offline_cfs_rqs() we take get_online_cpus() lock. Reviewed-by: Ben Segall Reviewed-by: Srikar Dronamraju Signed-off-by: Kirill Tkhai CC: Konstantin Khorenko CC: Paul Turner CC: Mike Galbraith Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403684382.3462.42.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e50234b..2dbc63d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7817,6 +7817,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; + /* + * Prevent race between setting of cfs_rq->runtime_enabled and + * unthrottle_offline_cfs_rqs(). + */ + get_online_cpus(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -7842,7 +7847,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) } raw_spin_unlock_irq(&cfs_b->lock); - for_each_possible_cpu(i) { + for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; @@ -7858,6 +7863,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); + put_online_cpus(); return ret; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a140c6a..923fe32 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3798,6 +3798,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +static void __maybe_unused update_runtime_enabled(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + raw_spin_lock(&cfs_b->lock); + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; + raw_spin_unlock(&cfs_b->lock); + } +} + static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -3811,6 +3824,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * there's some valid quota amount */ cfs_rq->runtime_remaining = 1; + /* + * Offline rq is schedulable till cpu is completely disabled + * in take_cpu_down(), so we prevent new cfs throttling here. + */ + cfs_rq->runtime_enabled = 0; + if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -3854,6 +3873,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -7362,6 +7382,8 @@ void trigger_load_balance(struct rq *rq) static void rq_online_fair(struct rq *rq) { update_sysctl(); + + update_runtime_enabled(rq); } static void rq_offline_fair(struct rq *rq) -- cgit v0.10.2 From 99b625670f1447ecf0739161efbe7f2f43c0e0b6 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 25 Jun 2014 12:19:48 +0400 Subject: sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime() Make rt_rq available for pick_next_task(). Otherwise, their tasks stay prisoned long time till dead cpu becomes alive again. Reviewed-by: Srikar Dronamraju Signed-off-by: Kirill Tkhai CC: Konstantin Khorenko CC: Ben Segall CC: Paul Turner CC: Mike Galbraith Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403684388.3462.43.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a490831..671a8b5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -740,6 +740,9 @@ balanced: rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); + + /* Make rt_rq available for pick_next_task() */ + sched_rt_rq_enqueue(rt_rq); } } -- cgit v0.10.2 From b728ca06029d085a1585c1926610f26de93b9146 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 25 Jun 2014 12:19:55 +0400 Subject: sched: Rework check_for_tasks() 1) Iterate thru all of threads in the system. Check for all threads, not only for group leaders. 2) Check for p->on_rq instead of p->state and cputime. Preempted task in !TASK_RUNNING state OR just created task may be queued, that we want to be reported too. 3) Use read_lock() instead of write_lock(). This function does not change any structures, and read_lock() is enough. Signed-off-by: Kirill Tkhai Reviewed-by: Srikar Dronamraju Cc: Andrew Morton Cc: Ben Segall Cc: Fabian Frederick Cc: Gautham R. Shenoy Cc: Konstantin Khorenko Cc: Linus Torvalds Cc: Michael wang Cc: Mike Galbraith Cc: Paul Gortmaker Cc: Paul Turner Cc: Rafael J. Wysocki Cc: Srivatsa S. Bhat Cc: Todd E Brandt Cc: Toshi Kani Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403684395.3462.44.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/cpu.c b/kernel/cpu.c index a343bde..81e2a38 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu) rcu_read_unlock(); } -static inline void check_for_tasks(int cpu) +static inline void check_for_tasks(int dead_cpu) { - struct task_struct *p; - cputime_t utime, stime; + struct task_struct *g, *p; - write_lock_irq(&tasklist_lock); - for_each_process(p) { - task_cputime(p, &utime, &stime); - if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (utime || stime)) - pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); - } - write_unlock_irq(&tasklist_lock); + read_lock_irq(&tasklist_lock); + do_each_thread(g, p) { + if (!p->on_rq) + continue; + /* + * We do the check with unlocked task_rq(p)->lock. + * Order the reading to do not warn about a task, + * which was running on this cpu in the past, and + * it's just been woken on another cpu. + */ + rmb(); + if (task_cpu(p) != dead_cpu) + continue; + + pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", + p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); + } while_each_thread(g, p); + read_unlock_irq(&tasklist_lock); } struct take_cpu_down_param { -- cgit v0.10.2 From 466af29bf4270e84261712428a1304c28e3743fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 18:52:06 +0200 Subject: sched/deadline: Kill task_struct->pi_top_task Remove task_struct->pi_top_task. The only user, rt_mutex_setprio(), can use a local. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: Dario Faggioli Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric W. Biederman Cc: Linus Torvalds Cc: Matthew Dempsky Cc: Michal Simek Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20140606165206.GB29465@redhat.com Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 306f4f0..c9c9ff7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1440,8 +1440,6 @@ struct task_struct { struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; - /* Top pi_waiters task */ - struct task_struct *pi_top_task; #endif #ifdef CONFIG_DEBUG_MUTEXES diff --git a/kernel/fork.c b/kernel/fork.c index d2799d1..6ff87f442 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p) p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; - p->pi_top_task = NULL; #endif } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2dbc63d..cf7695a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2980,7 +2980,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); - p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->on_rq; @@ -3000,8 +2999,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || (p->pi_top_task && - dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + if (!dl_prio(p->normal_prio) || + (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; p->dl.dl_throttled = 0; enqueue_flag = ENQUEUE_REPLENISH; -- cgit v0.10.2 From 8875125efe8402c4d84b08291e68f1281baba8e2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sun, 29 Jun 2014 00:03:57 +0400 Subject: sched: Transform resched_task() into resched_curr() We always use resched_task() with rq->curr argument. It's not possible to reschedule any task but rq's current. The patch introduces resched_curr(struct rq *) to replace all of the repeating patterns. The main aim is cleanup, but there is a little size profit too: (before) $ size kernel/sched/built-in.o text data bss dec hex filename 155274 16445 7042 178761 2ba49 kernel/sched/built-in.o $ size vmlinux text data bss dec hex filename 7411490 1178376 991232 9581098 92322a vmlinux (after) $ size kernel/sched/built-in.o text data bss dec hex filename 155130 16445 7042 178617 2b9b9 kernel/sched/built-in.o $ size vmlinux text data bss dec hex filename 7411362 1178376 991232 9580970 9231aa vmlinux I was choosing between resched_curr() and resched_rq(), and the first name looks better for me. A little lie in Documentation/trace/ftrace.txt. I have not actually collected the tracing again. With a hope the patch won't make execution times much worse :) Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Randy Dunlap Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140628200219.1778.18735.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 2479b2a..4da4261 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1515,7 +1515,7 @@ Doing the same with chrt -r 5 and function-trace set. -0 3d.h4 1us+: 0:120:R + [003] 2448: 94:R sleep -0 3d.h4 2us : ttwu_do_activate.constprop.87 <-try_to_wake_up -0 3d.h3 3us : check_preempt_curr <-ttwu_do_wakeup - -0 3d.h3 3us : resched_task <-check_preempt_curr + -0 3d.h3 3us : resched_curr <-check_preempt_curr -0 3dNh3 4us : task_woken_rt <-ttwu_do_wakeup -0 3dNh3 4us : _raw_spin_unlock <-try_to_wake_up -0 3dNh3 4us : sub_preempt_count <-_raw_spin_unlock diff --git a/include/linux/sched.h b/include/linux/sched.h index c9c9ff7..41a1953 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2786,7 +2786,7 @@ static inline bool __must_check current_set_polling_and_test(void) /* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic(); @@ -2804,7 +2804,7 @@ static inline bool __must_check current_clr_polling_and_test(void) /* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic(); @@ -2836,7 +2836,7 @@ static inline void current_clr_polling(void) * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also * fold. */ - smp_mb(); /* paired with resched_task() */ + smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf7695a..2f96081 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -589,30 +589,31 @@ static bool set_nr_if_polling(struct task_struct *p) #endif /* - * resched_task - mark a task 'to be rescheduled now'. + * resched_curr - mark rq's current task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_task(struct task_struct *p) +void resched_curr(struct rq *rq) { + struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_held(&rq->lock); - if (test_tsk_need_resched(p)) + if (test_tsk_need_resched(curr)) return; - cpu = task_cpu(p); + cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(p); + set_tsk_need_resched(curr); set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(p)) + if (set_nr_and_not_polling(curr)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -625,7 +626,7 @@ void resched_cpu(int cpu) if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; - resched_task(cpu_curr(cpu)); + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1027,7 +1028,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) if (class == rq->curr->sched_class) break; if (class == p->sched_class) { - resched_task(rq->curr); + resched_curr(rq); break; } } @@ -3073,7 +3074,7 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + resched_curr(rq); } out_unlock: task_rq_unlock(rq, p, &flags); @@ -4299,7 +4300,7 @@ again: * fairness. */ if (preempt && rq != p_rq) - resched_task(p_rq->curr); + resched_curr(p_rq); } out_unlock: @@ -7106,7 +7107,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); - resched_task(rq->curr); + resched_curr(rq); } check_class_changed(rq, p, prev_class, old_prio); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc4f98b1..df0b77a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -535,7 +535,7 @@ again: if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); else - resched_task(rq->curr); + resched_curr(rq); #ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, @@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) - resched_task(curr); + resched_curr(rq); } /* @@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; - resched_task(rq->curr); + resched_curr(rq); } static int pull_dl_task(struct rq *this_rq); @@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1333,7 +1333,7 @@ retry: if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1373,7 +1373,7 @@ retry: set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); - resched_task(later_rq->curr); + resched_curr(later_rq); double_unlock_balance(rq, later_rq); @@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* * Again, we don't know if p has a earlier * or later deadline, so let's blindly set a * (maybe not needed) rescheduling point. */ - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else switched_to_dl(rq, p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 923fe32..f5f0cc9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2923,7 +2923,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -2947,7 +2947,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static void @@ -3087,7 +3087,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); return; } /* @@ -3278,7 +3278,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static __always_inline @@ -3438,7 +3438,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_task(rq->curr); + resched_curr(rq); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, @@ -3897,7 +3897,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_curr(rq); return; } @@ -4766,7 +4766,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_curr(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -7457,7 +7457,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_curr(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -7482,7 +7482,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_curr(rq); } else check_preempt_curr(rq, p, 0); } @@ -7545,7 +7545,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * if we can still preempt the current task. */ if (rq->curr == p) - resched_task(rq->curr); + resched_curr(rq); else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b7..67ad4e7 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) */ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { - resched_task(rq->idle); + resched_curr(rq); } static struct task_struct * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 671a8b5..5f6edca 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + int cpu = cpu_of(rq); rt_se = rt_rq->tg->rt_se[cpu]; @@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) enqueue_rt_entity(rt_se, false); if (rt_rq->highest_prio.curr < curr->prio) - resched_task(curr); + resched_curr(rq); } } @@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) return; enqueue_top_rt_rq(rt_rq); - resched_task(rq->curr); + resched_curr(rq); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -951,7 +952,7 @@ static void update_curr_rt(struct rq *rq) raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -1366,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * to try and push current away: */ requeue_task_rt(rq, p, 1); - resched_task(rq->curr); + resched_curr(rq); } #endif /* CONFIG_SMP */ @@ -1377,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1693,7 +1694,7 @@ retry: * just reschedule current. */ if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1740,7 +1741,7 @@ retry: activate_task(lowest_rq, next_task, 0); ret = 1; - resched_task(lowest_rq->curr); + resched_curr(lowest_rq); double_unlock_balance(rq, lowest_rq); @@ -1939,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) return; if (pull_rt_task(rq)) - resched_task(rq->curr); + resched_curr(rq); } void __init init_sched_rt_class(void) @@ -1977,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } @@ -2006,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * Only reschedule if p is still on the same runqueue. */ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* For UP simply resched on drop of prio */ if (oldprio < p->prio) - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else { /* @@ -2019,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * then reschedule. */ if (p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0191ed5..1283945 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1199,7 +1199,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void init_sched_dl_class(void); -extern void resched_task(struct task_struct *p); +extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; -- cgit v0.10.2 From 1b09d29bc00964d9032d80516f958044ac6b3805 Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Mon, 7 Jul 2014 05:59:04 +0000 Subject: sched/rt: Fix replenish_dl_entity() comments to match the current upstream code Signed-off-by: xiaofeng.yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1404712744-16986-1-git-send-email-xiaofeng.yan@huawei.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index df0b77a..255ce13 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * the overrunning entity can't interfere with other entity in the system and * can't make them miss their deadlines. Reasons why this kind of overruns * could happen are, typically, a entity voluntarily trying to overcome its - * runtime, or it just underestimated it during sched_setscheduler_ex(). + * runtime, or it just underestimated it during sched_setattr(). */ static void replenish_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) -- cgit v0.10.2 From 6e76ea8a8209386c3cc7ee5594e6ea5d25525cf2 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 2 Jul 2014 15:52:41 +0000 Subject: sched: Remove extra static_key*() function indirection I think its a bit simpler without having to follow an extra layer of static inline fuctions. No functional change just cosmetic. Signed-off-by: Jason Baron Signed-off-by: Peter Zijlstra Cc: rostedt@goodmis.org Cc: Linus Torvalds Link: http://lkml.kernel.org/r/2ce52233ce200faad93b6029d90f1411cd926667.1404315388.git.jbaron@akamai.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1283945..579712f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -887,20 +887,10 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct static_key *key) -{ - return static_key_true(key); /* Not out of line branch. */ -} - -static __always_inline bool static_branch__false(struct static_key *key) -{ - return static_key_false(key); /* Out of line branch. */ -} - #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ - return static_branch__##enabled(key); \ + return static_key_##enabled(key); \ } #include "features.h" -- cgit v0.10.2 From 5cd08fbfdb6baa9fe98f530b76898fc5725a6289 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 2 Jul 2014 15:52:44 +0000 Subject: sched: Fix static_key race with sched_feat() As pointed out by Andi Kleen, the usage of static keys can be racy in sched_feat_disable() vs. sched_feat_enable(). Currently, we first check the value of keys->enabled, and subsequently update the branch direction. This, can be racy and can potentially leave the keys in an inconsistent state. Take the i_mutex around these calls to resolve the race. Reported-by: Andi Kleen Signed-off-by: Jason Baron Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/9d7780c83db26683955cd01e6bc654ee2586e67f.1404315388.git.jbaron@akamai.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f96081..8705125 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -245,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, char buf[64]; char *cmp; int i; + struct inode *inode; if (cnt > 63) cnt = 63; @@ -255,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; cmp = strstrip(buf); + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + mutex_lock(&inode->i_mutex); i = sched_feat_set(cmp); + mutex_unlock(&inode->i_mutex); if (i == __SCHED_FEAT_NR) return -EINVAL; -- cgit v0.10.2 From e720fff6341fe4b95e5a93c939bd3c77fa55ced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jul 2014 16:01:53 +0200 Subject: sched/numa: Revert "Use effective_load() to balance NUMA loads" Due to divergent trees, Rik find that this patch is no longer required. Requested-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-u6odkgkw8wz3m7orgsjfo5pi@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f5f0cc9..45943b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1151,7 +1151,6 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - struct task_group *tg; long src_load, dst_load; long load; long imp = env->p->numa_group ? groupimp : taskimp; @@ -1223,14 +1222,9 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - src_load = env->src_stats.load; - dst_load = env->dst_stats.load; - - /* Calculate the effect of moving env->p from src to dst. */ - load = env->p->se.load.weight; - tg = task_group(env->p); - src_load += effective_load(tg, env->src_cpu, -load, -load); - dst_load += effective_load(tg, env->dst_cpu, load, load); + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; if (moveimp > imp && moveimp > env->best_imp) { /* @@ -1250,11 +1244,9 @@ balance: goto unlock; if (cur) { - /* Cur moves in the opposite direction. */ - load = cur->se.load.weight; - tg = task_group(cur); - src_load += effective_load(tg, env->src_cpu, load, load); - dst_load += effective_load(tg, env->dst_cpu, -load, -load); + load = task_h_load(cur); + dst_load -= load; + src_load += load; } if (load_too_imbalanced(src_load, dst_load, env)) -- cgit v0.10.2 From 743162013d40ca612b4cb53d3a200dff2d9ab26e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Jul 2014 15:16:04 +1000 Subject: sched: Remove proliferation of wait_on_bit() action functions The current "wait_on_bit" interface requires an 'action' function to be provided which does the actual waiting. There are over 20 such functions, many of them identical. Most cases can be satisfied by one of just two functions, one which uses io_schedule() and one which just uses schedule(). So: Rename wait_on_bit and wait_on_bit_lock to wait_on_bit_action and wait_on_bit_lock_action to make it explicit that they need an action function. Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io which are *not* given an action function but implicitly use a standard one. The decision to error-out if a signal is pending is now made based on the 'mode' argument rather than being encoded in the action function. All instances of the old wait_on_bit and wait_on_bit_lock which can use the new version have been changed accordingly and their action functions have been discarded. wait_on_bit{_lock} does not return any specific error code in the event of a signal so the caller must check for non-zero and interpolate their own error code as appropriate. The wait_on_bit() call in __fscache_wait_on_invalidate() was ambiguous as it specified TASK_UNINTERRUPTIBLE but used fscache_wait_bit_interruptible as an action function. David Howells confirms this should be uniformly "uninterruptible" The main remaining user of wait_on_bit{,_lock}_action is NFS which needs to use a freezer-aware schedule() call. A comment in fs/gfs2/glock.c notes that having multiple 'action' functions is useful as they display differently in the 'wchan' field of 'ps'. (and /proc/$PID/wchan). As the new bit_wait{,_io} functions are tagged "__sched", they will not show up at all, but something higher in the stack. So the distinction will still be visible, only with different function names (gds2_glock_wait versus gfs2_glock_dq_wait in the gfs2/glock.c case). Since first version of this patch (against 3.15) two new action functions appeared, on in NFS and one in CIFS. CIFS also now uses an action function that makes the same freezer aware schedule call as NFS. Signed-off-by: NeilBrown Acked-by: David Howells (fscache, keys) Acked-by: Steven Whitehouse (gfs2) Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Steve French Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown Signed-off-by: Ingo Molnar diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt index bee2a5f..a1c052c 100644 --- a/Documentation/filesystems/caching/operations.txt +++ b/Documentation/filesystems/caching/operations.txt @@ -90,7 +90,7 @@ operations: to be cleared before proceeding: wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 4e84095..96c92b7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -615,16 +615,6 @@ static void write_endio(struct bio *bio, int error) } /* - * This function is called when wait_on_bit is actually waiting. - */ -static int do_io_schedule(void *word) -{ - io_schedule(); - - return 0; -} - -/* * Initiate a write on a dirty buffer, but don't wait for it. * * - If the buffer is not dirty, exit. @@ -640,8 +630,7 @@ static void __write_dirty_buffer(struct dm_buffer *b, return; clear_bit(B_DIRTY, &b->state); - wait_on_bit_lock(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); if (!write_list) submit_io(b, WRITE, b->block, write_endio); @@ -675,9 +664,9 @@ static void __make_buffer_clean(struct dm_buffer *b) if (!b->state) /* fast case */ return; - wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); __write_dirty_buffer(b, NULL); - wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); } /* @@ -1030,7 +1019,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, if (need_submit) submit_io(b, READ, b->block, read_endio); - wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); if (b->read_error) { int error = b->read_error; @@ -1209,15 +1198,13 @@ again: dropped_lock = 1; b->hold_count++; dm_bufio_unlock(c); - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, - TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); dm_bufio_lock(c); b->hold_count--; } else - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, - TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); } if (!test_bit(B_DIRTY, &b->state) && @@ -1321,15 +1308,15 @@ retry: __write_dirty_buffer(b, NULL); if (b->hold_count == 1) { - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); set_bit(B_DIRTY, &b->state); __unlink_buffer(b); __link_buffer(b, new_block, LIST_DIRTY); } else { sector_t old_block; - wait_on_bit_lock(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit_lock_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); /* * Relink buffer to "new_block" so that write_callback * sees "new_block" as a block number. @@ -1341,8 +1328,8 @@ retry: __unlink_buffer(b); __link_buffer(b, new_block, b->list_mode); submit_io(b, WRITE, new_block, write_endio); - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); __unlink_buffer(b); __link_buffer(b, old_block, b->list_mode); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5bd2290..864b03f 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1032,21 +1032,13 @@ static void start_merge(struct dm_snapshot *s) snapshot_merge_next_chunks(s); } -static int wait_schedule(void *ptr) -{ - schedule(); - - return 0; -} - /* * Stop the merging process and wait until it finishes. */ static void stop_merge(struct dm_snapshot *s) { set_bit(SHUTDOWN_MERGE, &s->state_bits); - wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, - TASK_UNINTERRUPTIBLE); + wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE); clear_bit(SHUTDOWN_MERGE, &s->state_bits); } diff --git a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c index e355806..f296394 100644 --- a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c +++ b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c @@ -253,13 +253,6 @@ static int dvb_usbv2_adapter_stream_exit(struct dvb_usb_adapter *adap) return usb_urb_exitv2(&adap->stream); } -static int wait_schedule(void *ptr) -{ - schedule(); - - return 0; -} - static int dvb_usb_start_feed(struct dvb_demux_feed *dvbdmxfeed) { struct dvb_usb_adapter *adap = dvbdmxfeed->demux->priv; @@ -273,8 +266,7 @@ static int dvb_usb_start_feed(struct dvb_demux_feed *dvbdmxfeed) dvbdmxfeed->pid, dvbdmxfeed->index); /* wait init is done */ - wait_on_bit(&adap->state_bits, ADAP_INIT, wait_schedule, - TASK_UNINTERRUPTIBLE); + wait_on_bit(&adap->state_bits, ADAP_INIT, TASK_UNINTERRUPTIBLE); if (adap->active_fe == -1) return -EINVAL; @@ -568,7 +560,7 @@ static int dvb_usb_fe_sleep(struct dvb_frontend *fe) if (!adap->suspend_resume_active) { set_bit(ADAP_SLEEP, &adap->state_bits); - wait_on_bit(&adap->state_bits, ADAP_STREAMING, wait_schedule, + wait_on_bit(&adap->state_bits, ADAP_STREAMING, TASK_UNINTERRUPTIBLE); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a389820..3e11aab 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3437,16 +3437,10 @@ done_unlocked: return 0; } -static int eb_wait(void *word) -{ - io_schedule(); - return 0; -} - void wait_on_extent_buffer_writeback(struct extent_buffer *eb) { - wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, - TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); } static noinline_for_stack int diff --git a/fs/buffer.c b/fs/buffer.c index eba6e4f..8f05111 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -61,16 +61,9 @@ inline void touch_buffer(struct buffer_head *bh) } EXPORT_SYMBOL(touch_buffer); -static int sleep_on_buffer(void *word) -{ - io_schedule(); - return 0; -} - void __lock_buffer(struct buffer_head *bh) { - wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, - TASK_UNINTERRUPTIBLE); + wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_buffer); @@ -123,7 +116,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback); */ void __wait_on_buffer(struct buffer_head * bh) { - wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__wait_on_buffer); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 20d75b8..b98366f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3934,13 +3934,6 @@ cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); } -static int -cifs_sb_tcon_pending_wait(void *unused) -{ - schedule(); - return signal_pending(current) ? -ERESTARTSYS : 0; -} - /* find and return a tlink with given uid */ static struct tcon_link * tlink_rb_search(struct rb_root *root, kuid_t uid) @@ -4039,11 +4032,10 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, - cifs_sb_tcon_pending_wait, TASK_INTERRUPTIBLE); if (ret) { cifs_put_tlink(tlink); - return ERR_PTR(ret); + return ERR_PTR(-ERESTARTSYS); } /* if it's good, return it */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e90a1e9..b88b1ad 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3618,13 +3618,6 @@ static int cifs_launder_page(struct page *page) return rc; } -static int -cifs_pending_writers_wait(void *unused) -{ - schedule(); - return 0; -} - void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3636,7 +3629,7 @@ void cifs_oplock_break(struct work_struct *work) int rc = 0; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, - cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); server->ops->downgrade_oplock(server, cinode, test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a174605..213c458 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1794,8 +1794,8 @@ cifs_revalidate_mapping(struct inode *inode) int rc; unsigned long *flags = &CIFS_I(inode)->flags; - rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3b0c62e..6bf55d0 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -582,7 +582,7 @@ int cifs_get_writer(struct cifsInodeInfo *cinode) start: rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, - cifs_oplock_break_wait, TASK_KILLABLE); + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index be568b7..ef9bef1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -342,7 +342,8 @@ static void __inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); + __wait_on_bit(wqh, &wq, bit_wait, + TASK_UNINTERRUPTIBLE); spin_lock(&inode->i_lock); } } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index aec01be..89acec7 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -160,7 +160,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie, _enter("%p", cookie); wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock; @@ -255,7 +255,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) if (!fscache_defer_lookup) { _debug("non-deferred lookup %p", &cookie->flags); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; @@ -463,7 +463,6 @@ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) _enter("%p", cookie); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, - fscache_wait_bit_interruptible, TASK_UNINTERRUPTIBLE); _leave(""); @@ -525,7 +524,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) } wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock_enable; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index bc6c08f..7872a62 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void) return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); } -extern int fscache_wait_bit(void *); -extern int fscache_wait_bit_interruptible(void *); extern int fscache_wait_atomic_t(atomic_t *); /* diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 63f868e..a31b83c 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -197,24 +197,6 @@ static void __exit fscache_exit(void) module_exit(fscache_exit); /* - * wait_on_bit() sleep function for uninterruptible waiting - */ -int fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * wait_on_bit() sleep function for interruptible waiting - */ -int fscache_wait_bit_interruptible(void *flags) -{ - schedule(); - return signal_pending(current); -} - -/* * wait_on_atomic_t() sleep function for uninterruptible waiting */ int fscache_wait_atomic_t(atomic_t *p) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ed70714..85332b9 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -298,7 +298,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) jif = jiffies; if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { fscache_stat(&fscache_n_retrievals_intr); _leave(" = -ERESTARTSYS"); @@ -342,7 +341,6 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, if (stat_op_waits) fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { ret = fscache_cancel_op(op, do_cancel); if (ret == 0) @@ -351,7 +349,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, /* it's been removed from the pending queue by another party, * so we should get to run shortly */ wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); } _debug("<<< GO"); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c355f73..770e167 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -856,27 +856,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) } /** - * gfs2_glock_holder_wait - * @word: unused - * - * This function and gfs2_glock_demote_wait both show up in the WCHAN - * field. Thus I've separated these otherwise identical functions in - * order to be more informative to the user. - */ - -static int gfs2_glock_holder_wait(void *word) -{ - schedule(); - return 0; -} - -static int gfs2_glock_demote_wait(void *word) -{ - schedule(); - return 0; -} - -/** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * @@ -888,7 +867,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) unsigned long time1 = jiffies; might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ /* Lengthen the minimum hold time. */ gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + @@ -1128,7 +1107,7 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 91f274d..992ca5b 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -936,12 +936,6 @@ fail: return error; } -static int dlm_recovery_wait(void *word) -{ - schedule(); - return 0; -} - static int control_first_done(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -976,7 +970,7 @@ restart: fs_info(sdp, "control_first_done wait gen %u\n", start_gen); wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, - dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index bc564c0..d3eae24 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1024,20 +1024,13 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } -static int gfs2_journalid_wait(void *word) -{ - if (signal_pending(current)) - return -EINTR; - schedule(); - return 0; -} - static int wait_on_journal(struct gfs2_sbd *sdp) { if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) return 0; - return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE) + ? -EINTR : 0; } void gfs2_online_uevent(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 94555d4..573bd3b 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -591,12 +591,6 @@ done: wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -static int gfs2_recovery_wait(void *word) -{ - schedule(); - return 0; -} - int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; @@ -609,7 +603,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) BUG_ON(!rv); if (wait) - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, TASK_UNINTERRUPTIBLE); return wait ? jd->jd_recover_error : 0; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1319b5c..2607ff1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -864,12 +864,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } -static int gfs2_umount_recovery_wait(void *word) -{ - schedule(); - return 0; -} - /** * gfs2_put_super - Unmount the filesystem * @sb: The VFS superblock @@ -894,7 +888,7 @@ restart: continue; spin_unlock(&sdp->sd_jindex_spin); wait_on_bit(&jd->jd_flags, JDF_RECOVERY, - gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } spin_unlock(&sdp->sd_jindex_spin); diff --git a/fs/inode.c b/fs/inode.c index 6eecb7f..5938f39 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1695,13 +1695,6 @@ int inode_needs_sync(struct inode *inode) } EXPORT_SYMBOL(inode_needs_sync); -int inode_wait(void *word) -{ - schedule(); - return 0; -} -EXPORT_SYMBOL(inode_wait); - /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6f0f590..5f09370 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -763,12 +763,6 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } -static int sleep_on_shadow_bh(void *word) -{ - io_schedule(); - return 0; -} - /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -906,8 +900,8 @@ repeat: if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); jbd_unlock_bh_state(bh); - wait_on_bit(&bh->b_state, BH_Shadow, - sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Shadow, + TASK_UNINTERRUPTIBLE); goto repeat; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4042ff5..524dd80 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -361,8 +361,8 @@ start: * Prevent starvation issues if someone is doing a consistency * sync-to-disk */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 44bf014..e2a0361 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -783,8 +783,8 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - nfs_wait_bit_killable, TASK_KILLABLE); + wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9927913..b7b710e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1074,8 +1074,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) * the bit lock here if it looks like we're going to be doing that. */ for (;;) { - ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) goto out; spin_lock(&inode->i_lock); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 848f685..42f1211 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1251,8 +1251,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); atomic_inc(&clp->cl_count); - res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); if (res) goto out; if (clp->cl_cons_state < 0) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b6ee3a6..6104d35 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -138,12 +138,6 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } -static int nfs_wait_bit_uninterruptible(void *word) -{ - io_schedule(); - return 0; -} - /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -158,7 +152,6 @@ nfs_page_group_lock(struct nfs_page *req) WARN_ON_ONCE(head != head->wb_head); wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, - nfs_wait_bit_uninterruptible, TASK_UNINTERRUPTIBLE); } @@ -425,9 +418,8 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { - return wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_uninterruptible, - TASK_UNINTERRUPTIBLE); + return wait_on_bit_io(&req->wb_flags, PG_BUSY, + TASK_UNINTERRUPTIBLE); } /* diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6fdcd23..a8914b3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1885,7 +1885,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { if (!sync) goto out; - status = wait_on_bit_lock(&nfsi->flags, + status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, TASK_KILLABLE); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 98ff061..f05f321 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -397,7 +397,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int err; /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, nfs_wait_bit_killable, TASK_KILLABLE); if (err) goto out_err; @@ -1475,7 +1475,7 @@ int nfs_commit_inode(struct inode *inode, int how) return error; if (!may_wait) goto out_mark_dirty; - error = wait_on_bit(&NFS_I(inode)->flags, + error = wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_COMMIT, nfs_wait_bit_killable, TASK_KILLABLE); diff --git a/include/linux/wait.h b/include/linux/wait.h index bd68819..73960ff 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -854,11 +854,14 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); (wait)->flags = 0; \ } while (0) + +extern int bit_wait(void *); +extern int bit_wait_io(void *); + /** * wait_on_bit - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on - * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * There is a standard hashed waitqueue table for generic use. This @@ -867,9 +870,62 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); * call wait_on_bit() in threads waiting for the bit to clear. * One uses wait_on_bit() where one is waiting for the bit to clear, * but has no intention of setting it. + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. + */ +static inline int +wait_on_bit(void *word, int bit, unsigned mode) +{ + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, + bit_wait, + mode); +} + +/** + * wait_on_bit_io - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared. This is similar to wait_on_bit(), but calls + * io_schedule() instead of schedule() for the actual waiting. + * + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. + */ +static inline int +wait_on_bit_io(void *word, int bit, unsigned mode) +{ + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, + bit_wait_io, + mode); +} + +/** + * wait_on_bit_action - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared, and allow the waiting action to be specified. + * This is like wait_on_bit() but allows fine control of how the waiting + * is done. + * + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. */ static inline int -wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) +wait_on_bit_action(void *word, int bit, int (*action)(void *), unsigned mode) { if (!test_bit(bit, word)) return 0; @@ -880,7 +936,6 @@ wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on - * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * There is a standard hashed waitqueue table for generic use. This @@ -891,9 +946,61 @@ wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) * wait_on_bit() in threads waiting to be able to set the bit. * One uses wait_on_bit_lock() where one is waiting for the bit to * clear with the intention of setting it, and when done, clearing it. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock(void *word, int bit, unsigned mode) +{ + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); +} + +/** + * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared and then to atomically set it. This is similar + * to wait_on_bit(), but calls io_schedule() instead of schedule() + * for the actual waiting. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock_io(void *word, int bit, unsigned mode) +{ + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); +} + +/** + * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared and then to set it, and allow the waiting action + * to be specified. + * This is like wait_on_bit() but allows fine control of how the waiting + * is done. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode) +wait_on_bit_lock_action(void *word, int bit, int (*action)(void *), unsigned mode) { if (!test_and_set_bit(bit, word)) return 0; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5777c13..a219be9 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -90,7 +90,6 @@ struct writeback_control { * fs/fs-writeback.c */ struct bdi_writeback; -int inode_wait(void *); void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); @@ -105,7 +104,7 @@ void inode_wait_for_writeback(struct inode *inode); static inline void wait_on_inode(struct inode *inode) { might_sleep(); - wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index adf9862..54e7522 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,12 +28,6 @@ #include -static int ptrace_trapping_sleep_fn(void *flags) -{ - schedule(); - return 0; -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -371,7 +365,7 @@ unlock_creds: out: if (!retval) { wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); proc_ptrace_connector(task, PTRACE_ATTACH); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0ffa20a..a104879 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p) __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); } EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(void *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(void *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + io_schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait_io); diff --git a/mm/filemap.c b/mm/filemap.c index dafb06f..d175917 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -241,18 +241,6 @@ void delete_from_page_cache(struct page *page) } EXPORT_SYMBOL(delete_from_page_cache); -static int sleep_on_page(void *word) -{ - io_schedule(); - return 0; -} - -static int sleep_on_page_killable(void *word) -{ - sleep_on_page(word); - return fatal_signal_pending(current) ? -EINTR : 0; -} - static int filemap_check_errors(struct address_space *mapping) { int ret = 0; @@ -692,7 +680,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@ -705,7 +693,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) return 0; return __wait_on_bit(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } /** @@ -806,7 +794,7 @@ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -816,7 +804,7 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); diff --git a/mm/ksm.c b/mm/ksm.c index 346ddc9..fb75902 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1978,18 +1978,12 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE -static int just_wait(void *word) -{ - schedule(); - return 0; -} - static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), - just_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0a43cce..e090bff 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2186,12 +2186,6 @@ static void hci_inq_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); } -static int wait_inquiry(void *word) -{ - schedule(); - return signal_pending(current); -} - int hci_inquiry(void __user *arg) { __u8 __user *ptr = arg; @@ -2242,7 +2236,7 @@ int hci_inquiry(void __user *arg) /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is * cleared). If it is interrupted by a signal, return -EINTR. */ - if (wait_on_bit(&hdev->flags, HCI_INQUIRY, wait_inquiry, + if (wait_on_bit(&hdev->flags, HCI_INQUIRY, TASK_INTERRUPTIBLE)) return -EINTR; } diff --git a/security/keys/gc.c b/security/keys/gc.c index d3222b6..9609a7f 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -92,15 +92,6 @@ static void key_gc_timer_func(unsigned long data) } /* - * wait_on_bit() sleep function for uninterruptible waiting - */ -static int key_gc_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* * Reap keys of dead type. * * We use three flags to make sure we see three complete cycles of the garbage @@ -123,7 +114,7 @@ void key_gc_keytype(struct key_type *ktype) schedule_work(&key_gc_work); kdebug("sleep"); - wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, key_gc_wait_bit, + wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, TASK_UNINTERRUPTIBLE); key_gc_dead_keytype = NULL; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 3814119..26a94f1 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -21,24 +21,6 @@ #define key_negative_timeout 60 /* default timeout on a negative key's existence */ -/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -static int key_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * wait_on_bit() sleep function for interruptible waiting - */ -static int key_wait_bit_intr(void *flags) -{ - schedule(); - return signal_pending(current) ? -ERESTARTSYS : 0; -} - /** * complete_request_key - Complete the construction of a key. * @cons: The key construction record. @@ -592,10 +574,9 @@ int wait_for_key_construction(struct key *key, bool intr) int ret; ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT, - intr ? key_wait_bit_intr : key_wait_bit, intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); - if (ret < 0) - return ret; + if (ret) + return -ERESTARTSYS; if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { smp_rmb(); return key->type_data.reject_error; -- cgit v0.10.2 From c1221321b7c25b53204447cff9949a6d5a7ddddc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Jul 2014 15:16:04 +1000 Subject: sched: Allow wait_on_bit_action() functions to support a timeout It is currently not possible for various wait_on_bit functions to implement a timeout. While the "action" function that is called to do the waiting could certainly use schedule_timeout(), there is no way to carry forward the remaining timeout after a false wake-up. As false-wakeups a clearly possible at least due to possible hash collisions in bit_waitqueue(), this is a real problem. The 'action' function is currently passed a pointer to the word containing the bit being waited on. No current action functions use this pointer. So changing it to something else will be a little noisy but will have no immediate effect. This patch changes the 'action' function to take a pointer to the "struct wait_bit_key", which contains a pointer to the word containing the bit so nothing is really lost. It also adds a 'private' field to "struct wait_bit_key", which is initialized to zero. An action function can now implement a timeout with something like static int timed_out_waiter(struct wait_bit_key *key) { unsigned long waited; if (key->private == 0) { key->private = jiffies; if (key->private == 0) key->private -= 1; } waited = jiffies - key->private; if (waited > 10 * HZ) return -EAGAIN; schedule_timeout(waited - 10 * HZ); return 0; } If any other need for context in a waiter were found it would be easy to use ->private for some other purpose, or even extend "struct wait_bit_key". My particular need is to support timeouts in nfs_release_page() to avoid deadlocks with loopback mounted NFS. While wait_on_bit_timeout() would be a cleaner interface, it will not meet my need. I need the timeout to be sensitive to the state of the connection with the server, which could change. So I need to use an 'action' interface. Signed-off-by: NeilBrown Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Steve French Cc: David Howells Cc: Steven Whitehouse Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140707051604.28027.41257.stgit@notabene.brown Signed-off-by: Ingo Molnar diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 213c458..41de393 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1780,7 +1780,7 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(void *word) +cifs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b7b710e..abd37a3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,7 +75,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * @word: long word containing the bit lock */ -int nfs_wait_bit_killable(void *word) +int nfs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 82ddbf4..e0193d6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -347,7 +347,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(void *word); +extern int nfs_wait_bit_killable(struct wait_bit_key *key); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6104d35..745a612d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -117,7 +117,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&c->flags); + ret = nfs_wait_bit_killable(&q.key); } while (atomic_read(&c->io_count) != 0); finish_wait(wq, &q.wait); return ret; diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ad7dbe2..1a89599 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -236,7 +236,7 @@ void * rpc_malloc(struct rpc_task *, size_t); void rpc_free(void *); int rpciod_up(void); void rpciod_down(void); -int __rpc_wait_for_completion_task(struct rpc_task *task, int (*)(void *)); +int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); #ifdef RPC_DEBUG struct net; void rpc_show_tasks(struct net *); diff --git a/include/linux/wait.h b/include/linux/wait.h index 73960ff..6fb1ba5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -25,6 +25,7 @@ struct wait_bit_key { void *flags; int bit_nr; #define WAIT_ATOMIC_T_BIT_NR -1 + unsigned long private; }; struct wait_bit_queue { @@ -141,18 +142,19 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) list_del(&old->task_list); } +typedef int wait_bit_action_f(struct wait_bit_key *); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); -int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); -int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); +int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); +int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); void wake_up_bit(void *, int); void wake_up_atomic_t(atomic_t *); -int out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned); -int out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned); +int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned); +int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned); int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); wait_queue_head_t *bit_waitqueue(void *, int); @@ -855,8 +857,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); } while (0) -extern int bit_wait(void *); -extern int bit_wait_io(void *); +extern int bit_wait(struct wait_bit_key *); +extern int bit_wait_io(struct wait_bit_key *); /** * wait_on_bit - wait for a bit to be cleared @@ -925,7 +927,7 @@ wait_on_bit_io(void *word, int bit, unsigned mode) * on that signal. */ static inline int -wait_on_bit_action(void *word, int bit, int (*action)(void *), unsigned mode) +wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { if (!test_bit(bit, word)) return 0; @@ -1000,7 +1002,7 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_action(void *word, int bit, int (*action)(void *), unsigned mode) +wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { if (!test_and_set_bit(bit, word)) return 0; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index a104879..15cab1a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function); */ int __sched __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { int ret = 0; do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); + ret = (*action)(&q->key); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit); int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { do { int ret; @@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(q->key.flags); + ret = action(&q->key); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit_lock); int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -503,7 +503,7 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(void *word) +__sched int bit_wait(struct wait_bit_key *word) { if (signal_pending_state(current->state, current)) return 1; @@ -512,7 +512,7 @@ __sched int bit_wait(void *word) } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(void *word) +__sched int bit_wait_io(struct wait_bit_key *word) { if (signal_pending_state(current->state, current)) return 1; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c0365c1..9358c79 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -250,7 +250,7 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(void *word) +static int rpc_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -309,7 +309,7 @@ static int rpc_complete_task(struct rpc_task *task) * to enforce taking of the wq->lock and hence avoid races with * rpc_complete_task(). */ -int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *)) +int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action) { if (action == NULL) action = rpc_wait_bit_killable; -- cgit v0.10.2 From d8d28c8f00e84a72e8bee39a85835635417bee49 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jul 2014 23:27:41 -0300 Subject: sched: Fix sched_setparam() policy == -1 logic The scheduler uses policy == -1 to preserve the current policy state to implement sched_setparam(). But, as (int) -1 is equals to 0xffffffff, it's matching the if (policy & SCHED_RESET_ON_FORK) on _sched_setscheduler(). This match changes the policy value to an invalid value, breaking the sched_setparam() syscall. This patch checks policy == -1 before check the SCHED_RESET_ON_FORK flag. The following program shows the bug: int main(void) { struct sched_param param = { .sched_priority = 5, }; sched_setscheduler(0, SCHED_FIFO, ¶m); param.sched_priority = 1; sched_setparam(0, ¶m); param.sched_priority = 0; sched_getparam(0, ¶m); if (param.sched_priority != 1) printf("failed priority setting (found %d instead of 1)\n", param.sched_priority); else printf("priority setting fine\n"); } Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra Reviewed-by: Steven Rostedt Cc: # 3.14+ Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Fixes: 7479f3c9cf67 "sched: Move SCHED_RESET_ON_FORK into attr::sched_flags" Link: http://lkml.kernel.org/r/9ebe0566a08dbbb3999759d3f20d6004bb2dbcfa.1406079891.git.bristot@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc1638b..0acf96b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3558,9 +3558,10 @@ static int _sched_setscheduler(struct task_struct *p, int policy, }; /* - * Fixup the legacy SCHED_RESET_ON_FORK hack + * Fixup the legacy SCHED_RESET_ON_FORK hack, except if + * the policy=-1 was passed by sched_setparam(). */ - if (policy & SCHED_RESET_ON_FORK) { + if ((policy != -1) && (policy & SCHED_RESET_ON_FORK)) { attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; attr.sched_policy = policy; -- cgit v0.10.2 From 6ae72dff37596f736b795426306404f0793e4b1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Jul 2014 11:47:40 +0200 Subject: sched: Robustify topology setup We hard assume that higher topology levels are supersets of lower levels. Detect, warn and try to fixup when we encounter this violated. Tested-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Cc: Josh Boyer Cc: "H. Peter Anvin" Cc: Bruno Wolff III Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140722094740.GJ12054@laptop.lan Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 415ab02..2a36a74 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6481,6 +6481,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; sd->child = child; + + if (!cpumask_subset(sched_domain_span(child), + sched_domain_span(sd))) { + pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" the %s domain not a subset of the %s domain\n", + child->name, sd->name); +#endif + /* Fixup, ensure @sd has at least @child cpus. */ + cpumask_or(sched_domain_span(sd), + sched_domain_span(sd), + sched_domain_span(child)); + } + } set_domain_attribute(sd, attr); -- cgit v0.10.2 From c13db6b131a4388edca9867a1457b988b83902f8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 23 Jul 2014 11:28:26 -0400 Subject: sched: Use macro for magic number of -1 for setparam Instead of passing around a magic number -1 for the sched_setparam() policy, use a more descriptive macro name like SETPARAM_POLICY. [ based on top of Daniel's sched_setparam() fix ] Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: Daniel Bristot de Oliveira Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140723112826.6ed6cbce@gandalf.local.home Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a36a74..2676866 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3218,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_yielded = 0; } +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 + static void __setscheduler_params(struct task_struct *p, const struct sched_attr *attr) { int policy = attr->sched_policy; - if (policy == -1) /* setparam */ + if (policy == SETPARAM_POLICY) policy = p->policy; p->policy = policy; @@ -3572,11 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, .sched_nice = PRIO_TO_NICE(p->static_prio), }; - /* - * Fixup the legacy SCHED_RESET_ON_FORK hack, except if - * the policy=-1 was passed by sched_setparam(). - */ - if ((policy != -1) && (policy & SCHED_RESET_ON_FORK)) { + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; attr.sched_policy = policy; @@ -3746,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { - return do_sched_setscheduler(pid, -1, param); + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); } /** -- cgit v0.10.2 From cd3bd4e628a6d57d66afe77835fe8d93ae3e41f8 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 28 Jul 2014 12:38:06 +0900 Subject: sched/fair: Fix 'make xmldocs' warning caused by missing description This patch fix following warning caused by missing description "overload" in kernel/sched/fair.c Warning(.//kernel/sched/fair.c:5906): No description found for parameter 'overload' Signed-off-by: Masanari Iida Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1406518686-7274-1-git-send-email-standby24x7@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45943b2..bfa3c86 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5898,6 +5898,7 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. + * @overload: Indicate more than one runnable task for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, -- cgit v0.10.2