From 095bebf61a460ad7f6a45bb17ddbf3a9df2b4397 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 3 Feb 2015 16:56:48 -0500 Subject: sched/numa: Do not move past the balance point if unbalanced There is a subtle interaction between the logic introduced in commit e63da03639cc ("sched/numa: Allow task switch if load imbalance improves"), the way the load balancer counts the load on each NUMA node, and the way NUMA hinting faults are done. Specifically, the load balancer only counts currently running tasks in the load, while NUMA hinting faults may cause tasks to stop, if the page is locked by another task. This could cause all of the threads of a large single instance workload, like SPECjbb2005, to migrate to the same NUMA node. This was possible because occasionally they all fault on the same few pages, and only one of the threads remains runnable. That thread can move to the process's preferred NUMA node without making the imbalance worse, because nothing else is running at that time. The fix is to check the direction of the net moving of load, and to refuse a NUMA move if it would cause the system to move past the point of balance. In an unbalanced state, only moves that bring us closer to the balance point are allowed. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/20150203165648.0e9ac692@annuminas.surriel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3..28cbaca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1196,9 +1196,11 @@ static void task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { - long imb, old_imb; - long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; + long orig_src_load; + long load_a, load_b; + long moved_load; + long imb; /* * The load is corrected for the CPU capacity available on each node. @@ -1211,30 +1213,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + load_a = dst_load; + load_b = src_load; + if (load_a < load_b) + swap(load_a, load_b); /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; + imb = load_a * src_capacity * 100 - + load_b * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. + * Allow a move that brings us closer to a balanced situation, + * without moving things past the point of balance. */ orig_src_load = env->src_stats.load; - orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + /* + * In a task swap, there will be one load moving from src to dst, + * and another moving back. This is the net sum of both moves. + * A simple task move will always have a positive value. + * Allow the move if it brings the system closer to a balanced + * situation, without crossing over the balance point. + */ + moved_load = orig_src_load - src_load; - /* Would this change make things worse? */ - return (imb > old_imb); + if (moved_load > 0) + /* Moving src -> dst. Did we overshoot balance? */ + return src_load * dst_capacity < dst_load * src_capacity; + else + /* Moving dst -> src. Did we overshoot balance? */ + return dst_load * src_capacity < src_load * dst_capacity; } /* -- cgit v0.10.2 From 890a5409f9d0c84d75a1e16eebdfe91d8a57ef1e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 9 Feb 2015 12:30:00 +0100 Subject: sched/numa: Avoid some pointless iterations Commit 81907478c431 ("sched/fair: Avoid using uninitialized variable in preferred_group_nid()") unconditionally initializes max_group with NODE_MASK_NONE, this means that when !max_faults (max_group didn't get set), we'll now continue the iteration with an empty mask. Which in turn makes the actual body of the loop go away, so we'll just iterate until completion; short circuit this by breaking out of the loop as soon as this would happen. Signed-off-by: Jan Beulich Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150209113727.GS5029@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28cbaca..ee595ef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1774,6 +1774,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) } } /* Next round, evaluate the nodes within max_group. */ + if (!max_faults) + break; nodes = max_group; } return nid; -- cgit v0.10.2 From 1e78cdbd9b2266503339accafe0ebdd99b93a531 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 16 Feb 2015 15:23:49 -0500 Subject: sched/rt/nohz: Stop scheduler tick if running realtime task If the CPU is running a realtime task that does not round-robin with another realtime task of equal priority, there is no point in keeping the scheduler tick going. After all, whenever the scheduler tick runs, the kernel will just decide not to reschedule. Extend sched_can_stop_tick() to recognize these situations, and inform the rest of the kernel that the scheduler tick can be stopped. Tested-by: Luiz Capitulino Signed-off-by: Rik van Riel Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: fweisbec@redhat.com Cc: mtosatti@redhat.com Link: http://lkml.kernel.org/r/20150216152349.6a8ed824@annuminas.surriel.com [ Small cleanliness tweak. ] Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a4869bd..97fe79c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -690,6 +690,23 @@ static inline bool got_nohz_idle_kick(void) bool sched_can_stop_tick(void) { /* + * FIFO realtime policy runs the highest priority task. Other runnable + * tasks are of a lower priority. The scheduler tick does nothing. + */ + if (current->policy == SCHED_FIFO) + return true; + + /* + * Round-robin realtime tasks time slice with other tasks at the same + * realtime priority. Is this task the only one at this priority? + */ + if (current->policy == SCHED_RR) { + struct sched_rt_entity *rt_se = ¤t->rt; + + return rt_se->run_list.prev == rt_se->run_list.next; + } + + /* * More than one running task need preemption. * nr_running update is assumed to be visible * after IPI is sent from wakers. -- cgit v0.10.2 From 44fb085bfa17628c6d2aaa6af6b292a8499e9cbd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 10 Mar 2015 12:20:00 +0800 Subject: sched/deadline: Add rq->clock update skip for dl task yield This patch adds rq->clock update skip for SCHED_DEADLINE task yield, to tell update_rq_clock() that we've just updated the clock, so that we don't do a microscopic update in schedule() and double the fastpath cost. Signed-off-by: Wanpeng Li Cc: Juri Lelli Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1425961200-3809-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6..0a81a95 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -914,6 +914,12 @@ static void yield_task_dl(struct rq *rq) } update_rq_clock(rq); update_curr_dl(rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP -- cgit v0.10.2 From b253149b843f89cd300cbdbea27ce1f847506f99 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 15 Jan 2014 00:37:34 -0500 Subject: sched/idle/x86: Restore mwait_idle() to fix boot hangs, to improve power savings and to improve performance In Linux-3.9 we removed the mwait_idle() loop: 69fb3676df33 ("x86 idle: remove mwait_idle() and "idle=mwait" cmdline param") The reasoning was that modern machines should be sufficiently happy during the boot process using the default_idle() HALT loop, until cpuidle loads and either acpi_idle or intel_idle invoke the newer MWAIT-with-hints idle loop. But two machines reported problems: 1. Certain Core2-era machines support MWAIT-C1 and HALT only. MWAIT-C1 is preferred for optimal power and performance. But if they support just C1, cpuidle never loads and so they use the boot-time default idle loop forever. 2. Some laptops will boot-hang if HALT is used, but will boot successfully if MWAIT is used. This appears to be a hidden assumption in BIOS SMI, that is presumably valid on the proprietary OS where the BIOS was validated. https://bugzilla.kernel.org/show_bug.cgi?id=60770 So here we effectively revert the patch above, restoring the mwait_idle() loop. However, we don't bother restoring the idle=mwait cmdline parameter, since it appears to add no value. Maintainer notes: For 3.9, simply revert 69fb3676df for 3.10, patch -F3 applies, fuzz needed due to __cpuinit use in context For 3.11, 3.12, 3.13, this patch applies cleanly Tested-by: Mike Galbraith Signed-off-by: Len Brown Acked-by: Mike Galbraith Cc: # 3.9+ Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Ian Malone Cc: Josh Boyer Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/345254a551eb5a6a866e048d7ab570fd2193aca4.1389763084.git.len.brown@intel.com [ Ported to recent kernels. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index a1410db..653dfa7 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + /* * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, * which can obviate IPI to trigger checking of need_resched. diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e127dda..da06f74 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -398,6 +399,49 @@ static void amd_e400_idle(void) default_idle(); } +/* + * Intel Core2 and older machines prefer MWAIT over HALT for C1. + * We can't rely on cpuidle installing MWAIT, because it will not load + * on systems that support only C1 -- so the boot default must be MWAIT. + * + * Some AMD machines are the opposite, they depend on using HALT. + * + * So for default C1, which is used during boot until cpuidle loads, + * use MWAIT-C1 on Intel HW that has it, else use HALT. + */ +static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (!cpu_has(c, X86_FEATURE_MWAIT)) + return 0; + + return 1; +} + +/* + * MONITOR/MWAIT with no hints, used for default default C1 state. + * This invokes MWAIT with interrutps enabled and no flags, + * which is backwards compatible with the original MWAIT implementation. + */ + +static void mwait_idle(void) +{ + if (!need_resched()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else + local_irq_enable(); +} + void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP @@ -411,6 +455,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; + } else if (prefer_mwait_c1_over_halt(c)) { + pr_info("using mwait in idle threads\n"); + x86_idle = mwait_idle; } else x86_idle = default_idle; } -- cgit v0.10.2 From f8e617f4582995f7c25ef25b4167213120ad122b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 18 Jan 2014 17:14:44 +0100 Subject: sched/idle/x86: Optimize unnecessary mwait_idle() resched IPIs To fully take advantage of MWAIT, apparently the CLFLUSH instruction needs another quirk on certain CPUs: proper barriers around it on certain machines. On a Q6600 SMP system, pipe-test scheduling performance, cross core, improves significantly: 3.8.13 487.2 KHz 1.000 3.13.0-master 415.5 KHz .852 3.13.0-master+ 415.2 KHz .852 + restore mwait_idle 3.13.0-master++ 488.5 KHz 1.002 + restore mwait_idle + IPI fix Since X86_BUG_CLFLUSH_MONITOR is already a quirk, don't create a separate quirk for the extra smp_mb()s. Signed-off-by: Mike Galbraith Cc: # 3.10+ Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Ian Malone Cc: Josh Boyer Cc: Len Brown Cc: Len Brown Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390061684.5566.4.camel@marge.simpson.net [ Ported to recent kernel, added comments about the quirk. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index da06f74..6ad8a63 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -428,18 +428,22 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) static void mwait_idle(void) { - if (!need_resched()) { - if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { + smp_mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); + smp_mb(); /* quirk */ + } __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); - } else + } else { local_irq_enable(); + } + __current_clr_polling(); } void select_idle_routine(const struct cpuinfo_x86 *c) -- cgit v0.10.2 From 71ad00d61ec861dc68b4544887729850e58cb99b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2015 10:18:51 -0400 Subject: irq_work: Fix build failure when CONFIG_IRQ_WORK is not defined When CONFIG_IRQ_WORK is not defined (difficult to do, as it also requires CONFIG_PRINTK not to be defined), we get a build failure: kernel/built-in.o: In function `flush_smp_call_function_queue': kernel/smp.c:263: undefined reference to `irq_work_run' kernel/smp.c:263: undefined reference to `irq_work_run' Makefile:933: recipe for target 'vmlinux' failed Simplest thing to do is to make irq_work_run() a nop when not set. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150319101851.4d224d9b@gandalf.local.home Signed-off-by: Ingo Molnar diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index bf3fe71..47b9ebd 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -38,16 +38,17 @@ bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); #endif -void irq_work_run(void); void irq_work_tick(void); void irq_work_sync(struct irq_work *work); #ifdef CONFIG_IRQ_WORK #include +void irq_work_run(void); bool irq_work_needs_cpu(void); #else static inline bool irq_work_needs_cpu(void) { return false; } +static inline void irq_work_run(void) { } #endif #endif /* _LINUX_IRQ_WORK_H */ -- cgit v0.10.2 From b6366f048e0caff28af5335b7af2031266e1b06b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Mar 2015 14:49:46 -0400 Subject: sched/rt: Use IPI to trigger RT task push migration instead of pulling When debugging the latencies on a 40 core box, where we hit 300 to 500 microsecond latencies, I found there was a huge contention on the runqueue locks. Investigating it further, running ftrace, I found that it was due to the pulling of RT tasks. The test that was run was the following: cyclictest --numa -p95 -m -d0 -i100 This created a thread on each CPU, that would set its wakeup in iterations of 100 microseconds. The -d0 means that all the threads had the same interval (100us). Each thread sleeps for 100us and wakes up and measures its latencies. cyclictest is maintained at: git://git.kernel.org/pub/scm/linux/kernel/git/clrkwllms/rt-tests.git What happened was another RT task would be scheduled on one of the CPUs that was running our test, when the other CPU tests went to sleep and scheduled idle. This caused the "pull" operation to execute on all these CPUs. Each one of these saw the RT task that was overloaded on the CPU of the test that was still running, and each one tried to grab that task in a thundering herd way. To grab the task, each thread would do a double rq lock grab, grabbing its own lock as well as the rq of the overloaded CPU. As the sched domains on this box was rather flat for its size, I saw up to 12 CPUs block on this lock at once. This caused a ripple affect with the rq locks especially since the taking was done via a double rq lock, which means that several of the CPUs had their own rq locks held while trying to take this rq lock. As these locks were blocked, any wakeups or load balanceing on these CPUs would also block on these locks, and the wait time escalated. I've tried various methods to lessen the load, but things like an atomic counter to only let one CPU grab the task wont work, because the task may have a limited affinity, and we may pick the wrong CPU to take that lock and do the pull, to only find out that the CPU we picked isn't in the task's affinity. Instead of doing the PULL, I now have the CPUs that want the pull to send over an IPI to the overloaded CPU, and let that CPU pick what CPU to push the task to. No more need to grab the rq lock, and the push/pull algorithm still works fine. With this patch, the latency dropped to just 150us over a 20 hour run. Without the patch, the huge latencies would trigger in seconds. I've created a new sched feature called RT_PUSH_IPI, which is enabled by default. When RT_PUSH_IPI is not enabled, the old method of grabbing the rq locks and having the pulling CPU do the work is implemented. When RT_PUSH_IPI is enabled, the IPI is sent to the overloaded CPU to do a push. To enabled or disable this at run time: # mount -t debugfs nodev /sys/kernel/debug # echo RT_PUSH_IPI > /sys/kernel/debug/sched_features or # echo NO_RT_PUSH_IPI > /sys/kernel/debug/sched_features Update: This original patch would send an IPI to all CPUs in the RT overload list. But that could theoretically cause the reverse issue. That is, there could be lots of overloaded RT queues and one CPU lowers its priority. It would then send an IPI to all the overloaded RT queues and they could then all try to grab the rq lock of the CPU lowering its priority, and then we have the same problem. The latest design sends out only one IPI to the first overloaded CPU. It tries to push any tasks that it can, and then looks for the next overloaded CPU that can push to the source CPU. The IPIs stop when all overloaded CPUs that have pushable tasks that have priorities greater than the source CPU are covered. In case the source CPU lowers its priority again, a flag is set to tell the IPI traversal to restart with the first RT overloaded CPU after the source CPU. Parts-suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Joern Engel Cc: Clark Williams Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150318144946.2f3cc982@gandalf.local.home Signed-off-by: Ingo Molnar diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d1..91e33cd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) */ SCHED_FEAT(TTWU_QUEUE, true) +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b07..ad02415 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include +#include int sched_rr_timeslice = RR_TIMESLICE; @@ -59,6 +60,10 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +#ifdef CONFIG_SMP +static void push_irq_work_func(struct irq_work *work); +#endif + void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) { struct rt_prio_array *array; @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); + +#ifdef HAVE_RT_PUSH_IPI + rt_rq->push_flags = 0; + rt_rq->push_cpu = nr_cpu_ids; + raw_spin_lock_init(&rt_rq->push_lock); + init_irq_work(&rt_rq->push_work, push_irq_work_func); #endif +#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) ; } +#ifdef HAVE_RT_PUSH_IPI +/* + * The search for the next cpu always starts at rq->cpu and ends + * when we reach rq->cpu again. It will never return rq->cpu. + * This returns the next cpu to check, or nr_cpu_ids if the loop + * is complete. + * + * rq->rt.push_cpu holds the last cpu returned by this function, + * or if this is the first instance, it must hold rq->cpu. + */ +static int rto_next_cpu(struct rq *rq) +{ + int prev_cpu = rq->rt.push_cpu; + int cpu; + + cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); + + /* + * If the previous cpu is less than the rq's CPU, then it already + * passed the end of the mask, and has started from the beginning. + * We end if the next CPU is greater or equal to rq's CPU. + */ + if (prev_cpu < rq->cpu) { + if (cpu >= rq->cpu) + return nr_cpu_ids; + + } else if (cpu >= nr_cpu_ids) { + /* + * We passed the end of the mask, start at the beginning. + * If the result is greater or equal to the rq's CPU, then + * the loop is finished. + */ + cpu = cpumask_first(rq->rd->rto_mask); + if (cpu >= rq->cpu) + return nr_cpu_ids; + } + rq->rt.push_cpu = cpu; + + /* Return cpu to let the caller know if the loop is finished or not */ + return cpu; +} + +static int find_next_push_cpu(struct rq *rq) +{ + struct rq *next_rq; + int cpu; + + while (1) { + cpu = rto_next_cpu(rq); + if (cpu >= nr_cpu_ids) + break; + next_rq = cpu_rq(cpu); + + /* Make sure the next rq can push to this rq */ + if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + break; + } + + return cpu; +} + +#define RT_PUSH_IPI_EXECUTING 1 +#define RT_PUSH_IPI_RESTART 2 + +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu; + + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + raw_spin_lock(&rq->rt.push_lock); + /* Make sure it's still executing */ + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + /* + * Tell the IPI to restart the loop as things have + * changed since it started. + */ + rq->rt.push_flags |= RT_PUSH_IPI_RESTART; + raw_spin_unlock(&rq->rt.push_lock); + return; + } + raw_spin_unlock(&rq->rt.push_lock); + } + + /* When here, there's no IPI going around */ + + rq->rt.push_cpu = rq->cpu; + cpu = find_next_push_cpu(rq); + if (cpu >= nr_cpu_ids) + return; + + rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + + irq_work_queue_on(&rq->rt.push_work, cpu); +} + +/* Called from hardirq context */ +static void try_to_push_tasks(void *arg) +{ + struct rt_rq *rt_rq = arg; + struct rq *rq, *src_rq; + int this_cpu; + int cpu; + + this_cpu = rt_rq->push_cpu; + + /* Paranoid check */ + BUG_ON(this_cpu != smp_processor_id()); + + rq = cpu_rq(this_cpu); + src_rq = rq_of_rt_rq(rt_rq); + +again: + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); + push_rt_task(rq); + raw_spin_unlock(&rq->lock); + } + + /* Pass the IPI to the next rt overloaded queue */ + raw_spin_lock(&rt_rq->push_lock); + /* + * If the source queue changed since the IPI went out, + * we need to restart the search from that CPU again. + */ + if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { + rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; + rt_rq->push_cpu = src_rq->cpu; + } + + cpu = find_next_push_cpu(src_rq); + + if (cpu >= nr_cpu_ids) + rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; + raw_spin_unlock(&rt_rq->push_lock); + + if (cpu >= nr_cpu_ids) + return; + + /* + * It is possible that a restart caused this CPU to be + * chosen again. Don't bother with an IPI, just see if we + * have more to push. + */ + if (unlikely(cpu == rq->cpu)) + goto again; + + /* Try the next RT overloaded CPU */ + irq_work_queue_on(&rt_rq->push_work, cpu); +} + +static void push_irq_work_func(struct irq_work *work) +{ + struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); + + try_to_push_tasks(rt_rq); +} +#endif /* HAVE_RT_PUSH_IPI */ + static int pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, ret = 0, cpu; @@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) */ smp_rmb(); +#ifdef HAVE_RT_PUSH_IPI + if (sched_feat(RT_PUSH_IPI)) { + tell_cpu_to_push(this_rq); + return 0; + } +#endif + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435..c2c0d7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -418,6 +419,11 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* RT IPI pull logic requires IRQ_WORK */ +#ifdef CONFIG_IRQ_WORK +# define HAVE_RT_PUSH_IPI +#endif + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -435,7 +441,13 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; +#ifdef HAVE_RT_PUSH_IPI + int push_flags; + int push_cpu; + struct irq_work push_work; + raw_spinlock_t push_lock; #endif +#endif /* CONFIG_SMP */ int rt_queued; int rt_throttled; -- cgit v0.10.2 From 36ee28e45df50c2c8624b978335516e42d84ae1f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:04 +0100 Subject: sched: Add sched_avg::utilization_avg_contrib Add new statistics which reflect the average time a task is running on the CPU and the sum of these running time of the tasks on a runqueue. The latter is named utilization_load_avg. This patch is based on the usage metric that was proposed in the 1st versions of the per-entity load tracking patchset by Paul Turner but that has be removed afterwards. This version differs from the original one in the sense that it's not linked to task_group. The rq's utilization_load_avg will be used to check if a rq is overloaded or not instead of trying to compute how many tasks a group of CPUs can handle. Rename runnable_avg_period into avg_period as it is now used with both runnable_avg_sum and running_avg_sum. Add some descriptions of the variables to explain their differences. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432..fdca05c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1115,15 +1115,28 @@ struct load_weight { }; struct sched_avg { + u64 last_runnable_update; + s64 decay_count; + /* + * utilization_avg_contrib describes the amount of time that a + * sched_entity is running on a CPU. It is based on running_avg_sum + * and is scaled in the range [0..SCHED_LOAD_SCALE]. + * load_avg_contrib described the amount of time that a sched_entity + * is runnable on a rq. It is based on both runnable_avg_sum and the + * weight of the task. + */ + unsigned long load_avg_contrib, utilization_avg_contrib; /* * These sums represent an infinite geometric series and so are bound * above by 1024/(1-y). Thus we only need a u32 to store them for all * choices of y < 1-2^(-32)*1024. + * running_avg_sum reflects the time that the sched_entity is + * effectively running on the CPU. + * runnable_avg_sum represents the amount of time a sched_entity is on + * a runqueue which includes the running time that is monitored by + * running_avg_sum. */ - u32 runnable_avg_sum, runnable_avg_period; - u64 last_runnable_update; - s64 decay_count; - unsigned long load_avg_contrib; + u32 runnable_avg_sum, avg_period, running_avg_sum; }; #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8baaf85..578ff83 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group if (!se) { struct sched_avg *avg = &cpu_rq(cpu)->avg; P(avg->runnable_avg_sum); - P(avg->runnable_avg_period); + P(avg->avg_period); return; } @@ -94,7 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); - P(se->avg.runnable_avg_period); + P(se->avg.avg_period); P(se->avg.load_avg_contrib); P(se->avg.decay_count); #endif @@ -214,6 +214,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", + cfs_rq->utilization_load_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); @@ -636,8 +638,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); - P(se.avg.runnable_avg_period); + P(se.avg.running_avg_sum); + P(se.avg.avg_period); P(se.avg.load_avg_contrib); + P(se.avg.utilization_avg_contrib); P(se.avg.decay_count); #endif P(policy); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee595ef..414408dd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); +static inline void __update_task_entity_utilization(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) @@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) u32 slice; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = slice; - p->se.avg.runnable_avg_period = slice; + p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; + p->se.avg.avg_period = slice; __update_task_entity_contrib(&p->se); + __update_task_entity_utilization(&p->se); } #else void init_task_runnable_average(struct task_struct *p) @@ -1684,7 +1686,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) *period = now - p->last_task_numa_placement; } else { delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.runnable_avg_period; + *period = p->se.avg.avg_period; } p->last_sum_exec_runtime = runtime; @@ -2512,7 +2514,8 @@ static u32 __compute_runnable_contrib(u64 n) */ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; @@ -2538,7 +2541,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->last_runnable_update = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->runnable_avg_period % 1024; + delta_w = sa->avg_period % 1024; if (delta + delta_w >= 1024) { /* period roll-over */ decayed = 1; @@ -2551,7 +2554,9 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (running) + sa->running_avg_sum += delta_w; + sa->avg_period += delta_w; delta -= delta_w; @@ -2561,20 +2566,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); - sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + sa->running_avg_sum = decay_load(sa->running_avg_sum, + periods + 1); + sa->avg_period = decay_load(sa->avg_period, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; - sa->runnable_avg_period += runnable_contrib; + if (running) + sa->running_avg_sum += runnable_contrib; + sa->avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; - sa->runnable_avg_period += delta; + if (running) + sa->running_avg_sum += delta; + sa->avg_period += delta; return decayed; } @@ -2591,6 +2602,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.utilization_avg_contrib = + decay_load(se->avg.utilization_avg_contrib, decays); return decays; } @@ -2626,7 +2639,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->runnable_avg_period + 1); + sa->avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { @@ -2679,7 +2692,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable, + runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2697,7 +2711,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.runnable_avg_period + 1); + contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); } @@ -2716,6 +2730,27 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } + +static inline void __update_task_entity_utilization(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); + contrib /= (se->avg.avg_period + 1); + se->avg.utilization_avg_contrib = scale_load(contrib); +} + +static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.utilization_avg_contrib; + + if (entity_is_task(se)) + __update_task_entity_utilization(se); + + return se->avg.utilization_avg_contrib - old_contrib; +} + static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, long load_contrib) { @@ -2732,7 +2767,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, utilization_delta; u64 now; /* @@ -2744,18 +2779,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); + utilization_delta = __update_entity_utilization_avg_contrib(se); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + cfs_rq->utilization_load_avg += utilization_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -2830,6 +2869,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -2848,6 +2888,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -3185,6 +3226,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2c0d7b..4c95cc2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -363,8 +363,14 @@ struct cfs_rq { * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). + * runnable_load_avg is the sum of the load_avg_contrib of the + * sched_entities on the rq. + * blocked_load_avg is similar to runnable_load_avg except that its + * the blocked sched_entities on the rq. + * utilization_load_avg is the sum of the average running time of the + * sched_entities on the rq. */ - unsigned long runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; atomic64_t decay_counter; u64 last_decay; atomic_long_t removed_load; -- cgit v0.10.2 From 21f4486630b0bd1b6dbcc04f61836987fa54278f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 27 Feb 2015 16:54:05 +0100 Subject: sched: Track group sched_entity usage contributions Add usage contribution tracking for group entities. Unlike se->avg.load_avg_contrib, se->avg.utilization_avg_contrib for group entities is the sum of se->avg.utilization_avg_contrib for all entities on the group runqueue. It is _not_ influenced in any way by the task group h_load. Hence it is representing the actual cpu usage of the group, not its intended load contribution which may differ significantly from the utilization on lightly utilized systems. Signed-off-by: Morten Rasmussen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 578ff83..a245c1f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); + P(se->avg.running_avg_sum); P(se->avg.avg_period); P(se->avg.load_avg_contrib); + P(se->avg.utilization_avg_contrib); P(se->avg.decay_count); #endif #undef PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 414408dd..d94a865 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2747,6 +2747,9 @@ static long __update_entity_utilization_avg_contrib(struct sched_entity *se) if (entity_is_task(se)) __update_task_entity_utilization(se); + else + se->avg.utilization_avg_contrib = + group_cfs_rq(se)->utilization_load_avg; return se->avg.utilization_avg_contrib - old_contrib; } -- cgit v0.10.2 From a8faa8f55d48496f64d96df48298e54fd380f6af Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:06 +0100 Subject: sched: Remove frequency scaling from cpu_capacity Now that arch_scale_cpu_capacity has been introduced to scale the original capacity, the arch_scale_freq_capacity is no longer used (it was previously used by ARM arch). Remove arch_scale_freq_capacity from the computation of cpu_capacity. The frequency invariance will be handled in the load tracking and not in the CPU capacity. arch_scale_freq_capacity will be revisited for scaling load with the current frequency of the CPUs in a later patch. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d94a865..e54231f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6042,13 +6042,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_freq_capacity(sd, cpu); - else - capacity *= default_scale_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; -- cgit v0.10.2 From 0c1dc6b27dac883ee78392189c8e20e764d79bfa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Mar 2015 08:46:26 +0100 Subject: sched: Make sched entity usage tracking scale-invariant Apply frequency scale-invariance correction factor to usage tracking. Each segment of the running_avg_sum geometric series is now scaled by the current frequency so the utilization_avg_contrib of each entity will be invariant with frequency scaling. As a result, utilization_load_avg which is the sum of utilization_avg_contrib, becomes invariant too. So the usage level that is returned by get_cpu_usage(), stays relative to the max frequency as the cpu_capacity which is is compared against. Then, we want the keep the load tracking values in a 32-bit type, which implies that the max value of {runnable|running}_avg_sum must be lower than 2^32/88761=48388 (88761 is the max weigth of a task). As LOAD_AVG_MAX = 47742, arch_scale_freq_capacity() must return a value less than (48388/47742) << SCHED_CAPACITY_SHIFT = 1037 (SCHED_SCALE_CAPACITY = 1024). So we define the range to [0..SCHED_SCALE_CAPACITY] in order to avoid overflow. Signed-off-by: Morten Rasmussen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425455186-13451-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e54231f..7f031e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2484,6 +2484,8 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2512,7 +2514,7 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, struct sched_avg *sa, int runnable, int running) @@ -2520,6 +2522,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_runnable_update; /* @@ -2555,7 +2558,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += delta_w; if (running) - sa->running_avg_sum += delta_w; + sa->running_avg_sum += delta_w * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += delta_w; delta -= delta_w; @@ -2576,7 +2580,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += runnable_contrib; if (running) - sa->running_avg_sum += runnable_contrib; + sa->running_avg_sum += runnable_contrib * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += runnable_contrib; } @@ -2584,7 +2589,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += delta; if (running) - sa->running_avg_sum += delta; + sa->running_avg_sum += delta * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += delta; return decayed; @@ -2692,8 +2698,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable, - runnable); + __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, + runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2771,6 +2777,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, { struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta, utilization_delta; + int cpu = cpu_of(rq_of(cfs_rq)); u64 now; /* @@ -2782,7 +2789,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, cfs_rq->curr == se)) return; -- cgit v0.10.2 From b5b4860d1d61ddc5308c7d492cbeaa3a6e508d7f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:08 +0100 Subject: sched: Make scale_rt invariant with frequency The average running time of RT tasks is used to estimate the remaining compute capacity for CFS tasks. This remaining capacity is the original capacity scaled down by a factor (aka scale_rt_capacity). This estimation of available capacity must also be invariant with frequency scaling. A frequency scaling factor is applied on the running time of the RT tasks for computing scale_rt_capacity. In sched_rt_avg_update(), we now scale the RT execution time like below: rq->rt_avg += rt_delta * arch_scale_freq_capacity() >> SCHED_CAPACITY_SHIFT Then, scale_rt_capacity can be summarized by: scale_rt_capacity = SCHED_CAPACITY_SCALE * available / total with available = total - rq->rt_avg This has been been optimized in current code by: scale_rt_capacity = available / (total >> SCHED_CAPACITY_SHIFT) But we can also developed the equation like below: scale_rt_capacity = SCHED_CAPACITY_SCALE - ((rq->rt_avg << SCHED_CAPACITY_SHIFT) / total) and we can optimize the equation by removing SCHED_CAPACITY_SHIFT shift in the computation of rq->rt_avg and scale_rt_capacity(). so rq->rt_avg += rt_delta * arch_scale_freq_capacity() and scale_rt_capacity = SCHED_CAPACITY_SCALE - (rq->rt_avg / total) arch_scale_frequency_capacity() will be called in the hot path of the scheduler which implies to have a short and efficient function. As an example, arch_scale_frequency_capacity() should return a cached value that is updated periodically outside of the hot path. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7f031e4..dc7c693 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6004,7 +6004,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available, age_stamp, avg; + u64 total, used, age_stamp, avg; s64 delta; /* @@ -6020,19 +6020,12 @@ static unsigned long scale_rt_capacity(int cpu) total = sched_avg_period() + delta; - if (unlikely(total < avg)) { - /* Ensures that capacity won't end up being negative */ - available = 0; - } else { - available = total - avg; - } + used = div_u64(avg, total); - if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) - total = SCHED_CAPACITY_SCALE; + if (likely(used < SCHED_CAPACITY_SCALE)) + return SCHED_CAPACITY_SCALE - used; - total >>= SCHED_CAPACITY_SHIFT; - - return div_u64(available, total); + return 1; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c95cc2..3600002 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1386,9 +1386,11 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); +extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta; + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); sched_avg_update(rq); } #else -- cgit v0.10.2 From ca6d75e6908efbc350d536e0b496ebdac36b20d2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:09 +0100 Subject: sched: Add struct rq::cpu_capacity_orig This new field 'cpu_capacity_orig' reflects the original capacity of a CPU before being altered by rt tasks and/or IRQ The cpu_capacity_orig will be used: - to detect when the capacity of a CPU has been noticeably reduced so we can trig load balance to look for a CPU with better capacity. As an example, we can detect when a CPU handles a significant amount of irq (with CONFIG_IRQ_TIME_ACCOUNTING) but this CPU is seen as an idle CPU by scheduler whereas CPUs, which are really idle, are available. - evaluate the available capacity for CFS tasks Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kamalesh Babulal Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index feda520..7022e90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7216,7 +7216,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc7c693..10f84c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4363,6 +4363,11 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +static unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6040,6 +6045,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = capacity; sdg->sgc->capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); @@ -6094,7 +6100,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_of(cpu); + capacity_orig += capacity_orig_of(cpu); capacity += capacity_of(cpu); continue; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3600002..be56dfd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -615,6 +615,7 @@ struct rq { struct sched_domain *sd; unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; unsigned char idle_balance; /* For active balancing */ -- cgit v0.10.2 From 8bb5b00c2f90100a272b09a9d17ec7875d088aa7 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 4 Mar 2015 08:48:47 +0100 Subject: sched: Calculate CPU's usage statistic and put it into struct sg_lb_stats::group_usage Monitor the usage level of each group of each sched_domain level. The usage is the portion of cpu_capacity_orig that is currently used on a CPU or group of CPUs. We use the utilization_load_avg to evaluate the usage level of each group. The utilization_load_avg only takes into account the running time of the CFS tasks on a CPU with a maximum value of SCHED_LOAD_SCALE when the CPU is fully utilized. Nevertheless, we must cap utilization_load_avg which can be temporally greater than SCHED_LOAD_SCALE after the migration of a task on this CPU and until the metrics are stabilized. The utilization_load_avg is in the range [0..SCHED_LOAD_SCALE] to reflect the running load on the CPU whereas the available capacity for the CFS task is in the range [0..cpu_capacity_orig]. In order to test if a CPU is fully utilized by CFS tasks, we have to scale the utilization in the cpu_capacity_orig range of the CPU to get the usage of the latter. The usage can then be compared with the available capacity (ie cpu_capacity) to deduct the usage level of a CPU. The frequency scaling invariance of the usage is not taken into account in this patch, it will be solved in another patch which will deal with frequency scaling invariance on the utilization_load_avg. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425455327-13508-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10f84c3..471193b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4781,6 +4781,33 @@ next: done: return target; } +/* + * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the usage with the capacity of the CPU that is available for CFS + * task (ie cpu_capacity). + * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * CPU. It represents the amount of utilization of a CPU in the range + * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full + * capacity of the CPU because it's about the running time on this CPU. + * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in avg_period and running_load_avg or just + * after migrating tasks until the average stabilizes with the new running + * time. So we need to check that the usage stays into the range + * [0..cpu_capacity_orig] and cap if necessary. + * Without capping the usage, a group could be seen as overloaded (CPU0 usage + * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + */ +static int get_cpu_usage(int cpu) +{ + unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long capacity = capacity_orig_of(cpu); + + if (usage >= SCHED_LOAD_SCALE) + return capacity; + + return (usage * capacity) >> SCHED_LOAD_SHIFT; +} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -5907,6 +5934,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; + unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity_factor; unsigned int idle_cpus; @@ -6255,6 +6283,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; + sgs->group_usage += get_cpu_usage(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) -- cgit v0.10.2 From ea67821b9a3edadf602b7772a0b2a69657ced746 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:11 +0100 Subject: sched: Replace capacity_factor by usage The scheduler tries to compute how many tasks a group of CPUs can handle by assuming that a task's load is SCHED_LOAD_SCALE and a CPU's capacity is SCHED_CAPACITY_SCALE. 'struct sg_lb_stats:group_capacity_factor' divides the capacity of the group by SCHED_LOAD_SCALE to estimate how many task can run in the group. Then, it compares this value with the sum of nr_running to decide if the group is overloaded or not. But the 'group_capacity_factor' concept is hardly working for SMT systems, it sometimes works for big cores but fails to do the right thing for little cores. Below are two examples to illustrate the problem that this patch solves: 1- If the original capacity of a CPU is less than SCHED_CAPACITY_SCALE (640 as an example), a group of 3 CPUS will have a max capacity_factor of 2 (div_round_closest(3x640/1024) = 2) which means that it will be seen as overloaded even if we have only one task per CPU. 2 - If the original capacity of a CPU is greater than SCHED_CAPACITY_SCALE (1512 as an example), a group of 4 CPUs will have a capacity_factor of 4 (at max and thanks to the fix [0] for SMT system that prevent the apparition of ghost CPUs) but if one CPU is fully used by rt tasks (and its capacity is reduced to nearly nothing), the capacity factor of the group will still be 4 (div_round_closest(3*1512/1024) = 5 which is cap to 4 with [0]). So, this patch tries to solve this issue by removing capacity_factor and replacing it with the 2 following metrics: - The available CPU's capacity for CFS tasks which is already used by load_balance(). - The usage of the CPU by the CFS tasks. For the latter, utilization_avg_contrib has been re-introduced to compute the usage of a CPU by CFS tasks. 'group_capacity_factor' and 'group_has_free_capacity' has been removed and replaced by 'group_no_capacity'. We compare the number of task with the number of CPUs and we evaluate the level of utilization of the CPUs to define if a group is overloaded or if a group has capacity to handle more tasks. For SD_PREFER_SIBLING, a group is tagged overloaded if it has more than 1 task so it will be selected in priority (among the overloaded groups). Since [1], SD_PREFER_SIBLING is no more concerned by the computation of 'load_above_capacity' because local is not overloaded. [1] 9a5d9ba6a363 ("sched/fair: Allow calculate_imbalance() to move idle cpus") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1425052454-25797-9-git-send-email-vincent.guittot@linaro.org [ Tidied up the changelog. ] Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 471193b..7e13dd0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5936,11 +5936,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_has_free_capacity; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6156,28 +6155,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -6215,37 +6201,56 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. - * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. + * We consider that a group has spare capacity if the * number of task is + * smaller than the number of CPUs or if the usage is lower than the available + * capacity for CFS tasks. + * For the latter, we use a threshold to stabilize the state, to take into + * account the variance of the tasks' load and to return true if the available + * capacity in meaningful for the load balancer. + * As an example, an available capacity of 1% can appear but it doesn't make + * any benefit for the load balance. */ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if (sgs->sum_nr_running < sgs->group_weight) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ + return false; +} + +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. + * group_is_overloaded is not equals to !group_has_capacity because a group + * with the exact right number of tasks, has no more spare capacity but is not + * overloaded so both group_has_capacity and group_is_overloaded return + * false. + */ +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running <= sgs->group_weight) + return false; - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - return capacity_factor; + return false; } -static enum group_type -group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_no_capacity) return group_overloaded; if (sg_imbalanced(group)) @@ -6306,11 +6311,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - sgs->group_capacity_factor = sg_capacity_factor(env, group); - sgs->group_type = group_classify(group, sgs); - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -6432,18 +6435,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) { - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); - sgs->group_type = group_classify(sg, sgs); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6623,11 +6627,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6690,6 +6695,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6710,8 +6716,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* @@ -6770,7 +6776,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6799,9 +6805,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6809,7 +6812,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /* -- cgit v0.10.2 From dc7ff76eadb4b89fd39bb466b8f3773e5467c11d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Mar 2015 11:35:03 +0100 Subject: sched: Remove unused struct sched_group_capacity::capacity_orig The 'struct sched_group_capacity::capacity_orig' field is no longer used in the scheduler so we can remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425378903-5349-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7022e90..838fc9d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5447,17 +5447,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - /* - * Even though we initialize ->capacity to something semi-sane, - * we leave capacity_orig unset. This allows us to detect if - * domain iteration is still funny without causing /0 traps. - */ - if (!group->sgc->capacity_orig) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); - break; - } - if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); @@ -5941,7 +5930,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e13dd0..d36f8d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6073,7 +6073,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity >>= SCHED_CAPACITY_SHIFT; cpu_rq(cpu)->cpu_capacity_orig = capacity; - sdg->sgc->capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -6089,7 +6088,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, capacity_orig; + unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6101,7 +6100,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) return; } - capacity_orig = capacity = 0; + capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6121,19 +6120,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * - * This avoids capacity/capacity_orig from being 0 and + * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. - * - * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_orig_of(cpu); capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; - capacity_orig += sgc->capacity_orig; capacity += sgc->capacity; } } else { @@ -6144,13 +6139,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity_orig += group->sgc->capacity_orig; capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgc->capacity_orig = capacity_orig; sdg->sgc->capacity = capacity; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be56dfd..dd532c5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -826,7 +826,7 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity, capacity_orig; + unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* -- cgit v0.10.2 From caff37ef96eac7fe96a582d032f6958e834e9447 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:13 +0100 Subject: sched: Add SD_PREFER_SIBLING for SMT level Add the SD_PREFER_SIBLING flag for SMT level in order to ensure that the scheduler will place at least one task per core. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Preeti U. Murthy Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-11-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 838fc9d..043e2a1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6240,6 +6240,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) */ if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ -- cgit v0.10.2 From 1aaf90a4b88aae26a4535ba01dacab520a310d17 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:14 +0100 Subject: sched: Move CFS tasks to CPUs with higher capacity When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining capacity for CFS tasks can be significantly reduced. Once we detect such situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle load balance to check if it's worth moving its tasks on an idle CPU. It's worth trying to move the task before the CPU is fully utilized to minimize the preemption by irq or RT tasks. Once the idle load_balance has selected the busiest CPU, it will look for an active load balance for only two cases: - There is only 1 task on the busiest CPU. - We haven't been able to move a task of the busiest rq. A CPU with a reduced capacity is included in the 1st case, and it's worth to actively migrate its task if the idle CPU has got more available capacity for CFS tasks. This test has been added in need_active_balance. As a sidenote, this will not generate more spurious ilb because we already trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that has a task, we will trig the ilb once for migrating the task. The nohz_kick_needed function has been cleaned up a bit while adding the new test env.src_cpu and env.src_rq must be set unconditionnally because they are used in need_active_balance which is called even if busiest->nr_running equals 1 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-12-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d36f8d2..0576ce0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6855,6 +6855,19 @@ static int need_active_balance(struct lb_env *env) return 1; } + /* + * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. + * It's worth migrating the task if the src_cpu's capacity is reduced + * because of other sched_class or IRQs if more capacity stays + * available on dst_cpu. + */ + if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->cfs.h_nr_running == 1)) { + if ((check_cpu_capacity(env->src_rq, sd)) && + (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -6954,6 +6967,9 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + ld_moved = 0; if (busiest->nr_running > 1) { /* @@ -6963,8 +6979,6 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -7664,22 +7678,25 @@ end: /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. + * of an idle cpu in the system. * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's capacity. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. + * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; + bool kick = false; if (unlikely(rq->idle_balance)) - return 0; + return false; /* * We may be recently in ticked or tickless idle mode. At the first @@ -7693,38 +7710,46 @@ static inline int nohz_kick_needed(struct rq *rq) * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + return false; if (time_before(now, nohz.next_balance)) - return 0; + return false; if (rq->nr_running >= 2) - goto need_kick; + return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (nr_busy > 1) - goto need_kick_unlock; + if (nr_busy > 1) { + kick = true; + goto unlock; + } + } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; - - rcu_read_unlock(); - return 0; + sched_domain_span(sd)) < cpu)) { + kick = true; + goto unlock; + } -need_kick_unlock: +unlock: rcu_read_unlock(); -need_kick: - return 1; + return kick; } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } -- cgit v0.10.2 From dfbca41f347997e57048a53755611c8e2d792924 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2015 14:19:05 +0100 Subject: sched: Optimize freq invariant accounting Currently the freq invariant accounting (in __update_entity_runnable_avg() and sched_rt_avg_update()) get the scale factor from a weak function call, this means that even for archs that default on their implementation the compiler cannot see into this function and optimize the extra scaling math away. This is sad, esp. since its a 64-bit multiplication which can be quite costly on some platforms. So replace the weak function with #ifdef and __always_inline goo. This is not quite as nice from an arch support PoV but should at least result in compile time errors if done wrong. Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: Vincent Guittot Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/20150323131905.GF23123@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0576ce0..3a798ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2484,8 +2484,6 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); - /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -6010,16 +6008,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_capacity(sd, cpu); -} - static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dd532c5..91c6736 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1387,7 +1387,14 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); -extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { -- cgit v0.10.2 From d4573c3e1c992668f5dcd57d1c2ced56ae9650b9 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Thu, 26 Mar 2015 18:32:44 +0530 Subject: sched: Improve load balancing in the presence of idle CPUs When a CPU is kicked to do nohz idle balancing, it wakes up to do load balancing on itself, followed by load balancing on behalf of idle CPUs. But it may end up with load after the load balancing attempt on itself. This aborts nohz idle balancing. As a result several idle CPUs are left without tasks till such a time that an ILB CPU finds it unfavorable to pull tasks upon itself. This delays spreading of load across idle CPUs and worse, clutters only a few CPUs with tasks. The effect of the above problem was observed on an SMT8 POWER server with 2 levels of numa domains. Busy loops equal to number of cores were spawned. Since load balancing on fork/exec is discouraged across numa domains, all busy loops would start on one of the numa domains. However it was expected that eventually one busy loop would run per core across all domains due to nohz idle load balancing. But it was observed that it took as long as 10 seconds to spread the load across numa domains. Further investigation showed that this was a consequence of the following: 1. An ILB CPU was chosen from the first numa domain to trigger nohz idle load balancing [Given the experiment, upto 6 CPUs per core could be potentially idle in this domain.] 2. However the ILB CPU would call load_balance() on itself before initiating nohz idle load balancing. 3. Given cores are SMT8, the ILB CPU had enough opportunities to pull tasks from its sibling cores to even out load. 4. Now that the ILB CPU was no longer idle, it would abort nohz idle load balancing As a result the opportunities to spread load across numa domains were lost until such a time that the cores within the first numa domain had equal number of tasks among themselves. This is a pretty bad scenario, since the cores within the first numa domain would have as many as 4 tasks each, while cores in the neighbouring numa domains would all remain idle. Fix this, by checking if a CPU was woken up to do nohz idle load balancing, before it does load balancing upon itself. This way we allow idle CPUs across the system to do load balancing which results in quicker spread of load, instead of performing load balancing within the local sched domain hierarchy of the ILB CPU alone under circumstances such as above. Signed-off-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jason Low Cc: benh@kernel.crashing.org Cc: daniel.lezcano@linaro.org Cc: efault@gmx.de Cc: iamjoonsoo.kim@lge.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: riel@redhat.com Cc: srikar@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: tim.c.chen@linux.intel.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20150326130014.21532.17158.stgit@preeti.in.ibm.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a798ec..46855d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7753,14 +7753,16 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_rq, idle); - /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are - * stopped. + * stopped. Do nohz_idle_balance *before* rebalance_domains to + * give the idle cpus a chance to load balance. Else we may + * load balance only within the local sched_domain hierarchy + * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); + rebalance_domains(this_rq, idle); } /* -- cgit v0.10.2 From bd4bde14b93cce8fa77765ff709e0be55abdba2c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 17 Mar 2015 19:15:30 +0800 Subject: sched/deadline: Avoid a superfluous check Since commit 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling") we clear the thottled state when switching from a dl task, therefore we should never find it set in switching to a dl task. Signed-off-by: Wanpeng Li [ Improved the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: http://lkml.kernel.org/r/1426590931-4639-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0a81a95..24c18dc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1665,14 +1665,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) { int check_resched = 1; - /* - * If p is throttled, don't consider the possibility - * of preempting rq->curr, the check will be done right - * after its runtime will get replenished. - */ - if (unlikely(p->dl.dl_throttled)) - return; - if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && -- cgit v0.10.2 From a1963b81deec54c113e770b0020e5f1c3188a087 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 17 Mar 2015 19:15:31 +0800 Subject: sched/deadline: Fix rt runtime corruption when dl fails its global constraints One version of sched_rt_global_constaints() (the !rt-cgroup one) changes state, therefore if we fail the later sched_dl_global_constraints() call the state is left in an inconsistent state. Fix this by changing the order of the calls. Signed-off-by: Wanpeng Li [ Improved the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: http://lkml.kernel.org/r/1426590931-4639-2-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 043e2a1..4b3b688 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7804,7 +7804,7 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ -static int sched_dl_global_constraints(void) +static int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); @@ -7905,11 +7905,11 @@ int sched_rt_handler(struct ctl_table *table, int write, if (ret) goto undo; - ret = sched_rt_global_constraints(); + ret = sched_dl_global_validate(); if (ret) goto undo; - ret = sched_dl_global_constraints(); + ret = sched_rt_global_constraints(); if (ret) goto undo; -- cgit v0.10.2 From 07c54f7a7ff77bb47bae26e566969e9c4b6fb0c6 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Tue, 3 Mar 2015 13:50:27 +0200 Subject: sched/core: Remove unused argument from init_[rt|dl]_rq() Obviously, 'rq' is not used in these two functions, therefore, there is no reason for it to be passed as an argument. Signed-off-by: Abel Vesa Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1425383427-26244-1-git-send-email-abelvesa@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b3b688..4c49e75 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7164,8 +7164,8 @@ void __init sched_init(void) rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); - init_rt_rq(&rq->rt, rq); - init_dl_rq(&rq->dl, rq); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 24c18dc..5e2f99b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) dl_b->total_bw = 0; } -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->rb_root = RB_ROOT; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad02415..575da76 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -64,7 +64,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) static void push_irq_work_func(struct irq_work *work); #endif -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; int i; @@ -205,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_rt_rq(rt_rq, cpu_rq(i)); + init_rt_rq(rt_rq); rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91c6736..e0e1299 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1671,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); -- cgit v0.10.2 From 4cd57f97135840f637431c92380c8da3edbe44ed Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 31 Mar 2015 09:53:36 +0100 Subject: sched/deadline: Always enqueue on previous rq when dl_task_timer() fires dl_task_timer() may fire on a different rq from where a task was removed after throttling. Since the call path is: dl_task_timer() -> enqueue_task_dl() -> enqueue_dl_entity() -> replenish_dl_entity() and replenish_dl_entity() uses dl_se's rq, we can't use current's rq in dl_task_timer(), but we need to lock the task's previous one. Tested-by: Wanpeng Li Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Kirill Tkhai Cc: Juri Lelli Fixes: 3960c8c0c789 ("sched: Make dl_task_time() use task_rq_lock()") Link: http://lkml.kernel.org/r/1427792017-7356-1-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5e2f99b..9d3ad64 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -514,7 +514,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) unsigned long flags; struct rq *rq; - rq = task_rq_lock(current, &flags); + rq = task_rq_lock(p, &flags); /* * We need to take care of several possible races here: @@ -569,7 +569,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) push_dl_task(rq); #endif unlock: - task_rq_unlock(rq, current, &flags); + task_rq_unlock(rq, p, &flags); return HRTIMER_NORESTART; } -- cgit v0.10.2 From 3c18d447b3b36a8d3c90dc37dfbd363cdb685d0a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 31 Mar 2015 09:53:37 +0100 Subject: sched/core: Check for available DL bandwidth in cpuset_cpu_inactive() Hotplug operations are destructive w.r.t. cpusets. In case such an operation is performed on a CPU belonging to an exlusive cpuset, the DL bandwidth information associated with the corresponding root domain is gone even if the operation fails (in sched_cpu_inactive()). For this reason we need to move the check we currently have in sched_cpu_inactive() to cpuset_cpu_inactive() to prevent useless cpusets reconfiguration in the CPU_DOWN_FAILED path. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Link: http://lkml.kernel.org/r/1427792017-7356-2-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4c49e75..28b0d75 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5337,36 +5337,13 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long flags; - long cpu = (long)hcpu; - struct dl_bw *dl_b; - switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active(cpu, false); - - /* explicitly allow suspend */ - if (!(action & CPU_TASKS_FROZEN)) { - bool overflow; - int cpus; - - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (overflow) - return notifier_from_errno(-EBUSY); - } + set_cpu_active((long)hcpu, false); return NOTIFY_OK; + default: + return NOTIFY_DONE; } - - return NOTIFY_DONE; } static int __init migration_init(void) @@ -7006,7 +6983,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, */ case CPU_ONLINE: - case CPU_DOWN_FAILED: cpuset_update_active_cpus(true); break; default: @@ -7018,8 +6994,32 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action) { + unsigned long flags; + long cpu = (long)hcpu; + struct dl_bw *dl_b; + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (overflow) { + trace_printk("hotplug failed for cpu %lu", cpu); + return notifier_from_errno(-EBUSY); + } + } cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: -- cgit v0.10.2 From fa9c9d10e97e38d9903fad1829535175ad261f45 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 27 Mar 2015 07:08:35 +0800 Subject: sched/deadline: Support DL task migration during CPU hotplug I observed that DL tasks can't be migrated to other CPUs during CPU hotplug, in addition, task may/may not be running again if CPU is added back. The root cause which I found is that DL tasks will be throtted and removed from the DL rq after comsuming all their budget, which leads to the situation that stop task can't pick them up from the DL rq and migrate them to other CPUs during hotplug. The method to reproduce: schedtool -E -t 50000:100000 -e ./test Actually './test' is just a simple for loop. Then observe which CPU the test task is on and offline it: echo 0 > /sys/devices/system/cpu/cpuN/online This patch adds the DL task migration during CPU hotplug by finding a most suitable later deadline rq after DL timer fires if current rq is offline. If it fails to find a suitable later deadline rq then it falls back to any eligible online CPU in so that the deadline task will come back to us, and the push/pull mechanism should then move it around properly. Suggested-and-Acked-by: Juri Lelli Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1427411315-4298-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d3ad64..5e95145 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) rq->post_schedule = has_pushable_dl_tasks(rq); } +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); + +static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +{ + struct rq *later_rq = NULL; + bool fallback = false; + + later_rq = find_lock_later_rq(p, rq); + + if (!later_rq) { + int cpu; + + /* + * If we cannot preempt any rq, fall back to pick any + * online cpu. + */ + fallback = true; + cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + if (cpu >= nr_cpu_ids) { + /* + * Fail to find any suitable cpu. + * The task will never come back! + */ + BUG_ON(dl_bandwidth_enabled()); + + /* + * If admission control is disabled we + * try a little harder to let the task + * run. + */ + cpu = cpumask_any(cpu_active_mask); + } + later_rq = cpu_rq(cpu); + double_lock_balance(rq, later_rq); + } + + deactivate_task(rq, p, 0); + set_task_cpu(p, later_rq->cpu); + activate_task(later_rq, p, ENQUEUE_REPLENISH); + + if (!fallback) + resched_curr(later_rq); + + double_unlock_balance(rq, later_rq); +} + #else static inline @@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); +#ifdef CONFIG_SMP + /* + * If we find that the rq the task was on is no longer + * available, we need to select a new rq. + */ + if (unlikely(!rq->online)) { + dl_task_offline_migration(rq, p); + goto unlock; + } +#endif + /* * If the throttle happened during sched-out; like: * -- cgit v0.10.2 From 62a935b256f68a71697716595347209fb5275426 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Apr 2015 10:42:50 +0200 Subject: sched/core: Drop debugging leftover trace_printk call Commit: 3c18d447b3b3 ("sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()") forgot a trace_printk() debugging piece in and Steve's banner screamed in dmesg. Remove it. Signed-off-by: Borislav Petkov Cc: Juri Lelli Cc: Juri Lelli Cc: Peter Zijlstra (Intel) Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1428050570-21041-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b0d75..8027cfd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7015,10 +7015,8 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, rcu_read_unlock_sched(); - if (overflow) { - trace_printk("hotplug failed for cpu %lu", cpu); + if (overflow) return notifier_from_errno(-EBUSY); - } } cpuset_update_active_cpus(false); break; -- cgit v0.10.2