From 44142fac3446d08c08c5d717ec11d50a737e8640 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:01 +0200 Subject: sched: fix sysctl_sched_child_runs_first flag fix the sched_child_runs_first flag: always call into ->task_new() if we are on the same CPU, as SCHED_OTHER tasks depend on it for correct initial setup. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 6c10fa7..2054e55 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1688,10 +1688,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) else p->sched_class = &fair_sched_class; - if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || - (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || - !current->se.on_rq) { - + if (task_cpu(p) != this_cpu || !p->sched_class->task_new || + !current->se.on_rq) { activate_task(rq, p, 0); } else { /* -- cgit v0.10.2 From bb61c210835db95b0e9fb612a316422e7cc675e3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: resched task in task_new_fair() to get full child-runs-first semantics make sure the parent is rescheduled. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 67c67a8..0990b20 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1191,6 +1191,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); + resched_task(rq->curr); } #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v0.10.2 From 2e45874c5aabe573b6ab4328f303c765701394f9 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: use list_for_each_entry_safe() in __wake_up_common() Use list_for_each_entry_safe() instead of list_for_each_safe() in __wake_up_common() Signed-off-by: Matthias Kaehlcke Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 2054e55..e92b185 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3634,10 +3634,9 @@ EXPORT_SYMBOL(default_wake_function); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { - struct list_head *tmp, *next; + wait_queue_t *curr, *next; - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, sync, key) && -- cgit v0.10.2 From a4b29ba2f72673aaa60ba11ced74d579771dd578 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: small sched_debug cleanup small kernel/sched_debug.c cleanup - break up multi-variable assignment. 
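(For reference, the list_for_each_entry_safe() conversion in the __wake_up_common() patch above is safe because the iterator keeps a lookahead cursor, so the entry being visited may be unlinked or freed mid-walk. A minimal user-space rendition of the pattern - the list helpers are re-declared here purely for illustration, in the kernel they come from <linux/list.h>:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)

/* iterate with a lookahead cursor ('n'), so 'pos' may be removed */
#define list_for_each_entry_safe(pos, n, head, member)			\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
	     n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

struct waiter {
	int id;
	struct list_head task_list;
};

int main(void)
{
	struct list_head q = { &q, &q };
	struct waiter *curr, *next;
	int i;

	for (i = 0; i < 3; i++) {
		/* add at the head, as list_add() would */
		struct waiter *w = malloc(sizeof(*w));
		w->id = i;
		w->task_list.next = q.next;
		w->task_list.prev = &q;
		q.next->prev = &w->task_list;
		q.next = &w->task_list;
	}

	list_for_each_entry_safe(curr, next, &q, task_list) {
		printf("waking %d\n", curr->id);
		/* unlink and free the current entry - safe, since
		 * 'next' was sampled before the body ran */
		curr->task_list.prev->next = curr->task_list.next;
		curr->task_list.next->prev = curr->task_list.prev;
		free(curr);
	}
	return 0;
}
)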
no code changed: text data bss dec hex filename 38869 3550 24 42443 a5cb sched.o.before 38869 3550 24 42443 a5cb sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index c3ee38b..94915f1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -279,9 +279,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; - p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.wait_max = 0; + p->se.wait_runtime_overruns = 0; + p->se.wait_runtime_underruns = 0; #endif - p->se.sum_exec_runtime = 0; + p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; } -- cgit v0.10.2 From eba1ed4b7e52720e3099325874811c38a5ec1562 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: debug: track maximum 'slice' track the maximum amount of time a task has executed while the CPU load was at least 2x. (i.e. at least two nice-0 tasks were runnable) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 833f7dc..9761b16 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -921,6 +921,7 @@ struct sched_entity { u64 block_start; u64 block_max; u64 exec_max; + u64 slice_max; unsigned long wait_runtime_overruns; unsigned long wait_runtime_underruns; diff --git a/kernel/sched.c b/kernel/sched.c index e92b185..282d037 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1603,6 +1603,7 @@ static void __sched_fork(struct task_struct *p) p->se.sleep_max = 0; p->se.block_max = 0; p->se.exec_max = 0; + p->se.slice_max = 0; p->se.wait_max = 0; p->se.wait_runtime_overruns = 0; p->se.wait_runtime_underruns = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 94915f1..fd080f6 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -254,6 +254,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.sleep_max); P(se.block_max); P(se.exec_max); + P(se.slice_max); P(se.wait_max); P(se.wait_runtime_overruns); P(se.wait_runtime_underruns); @@ -282,6 +283,7 @@ void proc_sched_set_task(struct task_struct *p) p->se.sleep_max = 0; p->se.block_max = 0; p->se.exec_max = 0; + p->se.slice_max = 0; p->se.wait_max = 0; p->se.wait_runtime_overruns = 0; p->se.wait_runtime_underruns = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0990b20..5c15d8a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -739,6 +739,17 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); set_cfs_rq_curr(cfs_rq, se); +#ifdef CONFIG_SCHEDSTATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. 
dont track it + * when there are only lesser-weight tasks around): + */ + if (rq_of(cfs_rq)->ls.load.weight >= 2*se->load.weight) { + se->slice_max = max(se->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } -- cgit v0.10.2 From 38ad464d410dadceda1563f36bdb0be7fe4c8938 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: uniform tunings use the same defaults on both UP and SMP. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 282d037..2520923 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4898,32 +4898,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 100000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_runtime_limit = sysctl_sched_latency; - sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; -} - #ifdef CONFIG_SMP /* * This is how migration works: @@ -6491,12 +6465,10 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); - sched_init_granularity(); } #else void __init sched_init_smp(void) { - sched_init_granularity(); } #endif /* CONFIG_SMP */ -- cgit v0.10.2 From 2bd8e6d422a4f44c0994f909317eba80b0fe08a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: use constants if !CONFIG_SCHED_DEBUG use constants if !CONFIG_SCHED_DEBUG. 
this speeds up the code and reduces code-size: text data bss dec hex filename 27464 3014 16 30494 771e sched.o.before 26929 3010 20 29959 7507 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 9761b16..befca3f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1402,15 +1402,18 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; extern unsigned int sysctl_sched_runtime_limit; -extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; +#endif + +extern unsigned int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 2520923..ae1544f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1659,12 +1659,6 @@ void sched_fork(struct task_struct *p, int clone_flags) } /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -unsigned int __read_mostly sysctl_sched_child_runs_first = 1; - -/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c15d8a..2e84aaf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,15 @@ */ /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* * Targeted preemption latency for CPU-bound tasks: * (default: 20ms, units: nanoseconds) * @@ -34,7 +43,13 @@ * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; +const_debug unsigned int sysctl_sched_latency = 20000000ULL; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * Minimal preemption granularity for CPU-bound tasks: @@ -58,7 +73,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; +const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -68,13 +83,10 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; +const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int sysctl_sched_stat_granularity __read_mostly; +const_debug unsigned int sysctl_sched_stat_granularity; -/* - * Initialized in sched_init_granularity() [to 5 times the base granularity]: - */ unsigned int sysctl_sched_runtime_limit __read_mostly; /* @@ -89,7 +101,7 @@ enum { SCHED_FEAT_SKIP_INITIAL = 32, }; -unsigned int sysctl_sched_features __read_mostly = +const_debug unsigned int sysctl_sched_features = SCHED_FEAT_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | -- cgit v0.10.2 From 8ebc91d93669af39dbed50914d7daf457eeb43be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: remove stat_gran remove the stat_gran code - it was disabled by default and it causes unnecessary overhead. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index befca3f..3c38a50 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -895,9 +895,6 @@ struct load_weight { */ struct sched_entity { long wait_runtime; - unsigned long delta_fair_run; - unsigned long delta_fair_sleep; - unsigned long delta_exec; s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; diff --git a/kernel/sched.c b/kernel/sched.c index ae1544f..d4dabfc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -829,7 +829,7 @@ static void update_curr_load(struct rq *rq) * Stagger updates to ls->delta_fair. Very frequent updates * can be expensive. */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) + if (ls->delta_stat) __update_curr_load(rq, ls); } @@ -1588,9 +1588,6 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.delta_exec = 0; - p->se.delta_fair_run = 0; - p->se.delta_fair_sleep = 0; p->se.wait_runtime = 0; p->se.sleep_start_fair = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e84aaf..2138c40 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -85,8 +85,6 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -const_debug unsigned int sysctl_sched_stat_granularity; - unsigned int sysctl_sched_runtime_limit __read_mostly; /* @@ -360,13 +358,13 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) * are not in our scheduling class. 
*/ static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) { - unsigned long delta, delta_exec, delta_fair, delta_mine; + unsigned long delta, delta_fair, delta_mine; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; - delta_exec = curr->delta_exec; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; @@ -400,6 +398,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq_curr(cfs_rq); + u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; if (unlikely(!curr)) @@ -410,15 +409,10 @@ static void update_curr(struct cfs_rq *cfs_rq) * since the last time we changed load (this cannot * overflow on 32 bits): */ - delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); - - curr->delta_exec += delta_exec; + delta_exec = (unsigned long)(now - curr->exec_start); - if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr); - curr->delta_exec = 0; - } - curr->exec_start = rq_of(cfs_rq)->clock; + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; } static inline void @@ -494,10 +488,9 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) * Note: must be called with a freshly updated rq->fair_clock. */ static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long delta_fair) { - unsigned long delta_fair = se->delta_fair_run; - schedstat_set(se->wait_max, max(se->wait_max, rq_of(cfs_rq)->clock - se->wait_start)); @@ -519,12 +512,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - se->delta_fair_run += delta_fair; - if (unlikely(abs(se->delta_fair_run) >= - sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se); - se->delta_fair_run = 0; - } + __update_stats_wait_end(cfs_rq, se, delta_fair); se->wait_start_fair = 0; schedstat_set(se->wait_start, 0); @@ -567,9 +555,10 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long delta_fair) { - unsigned long load = cfs_rq->load.weight, delta_fair; + unsigned long load = cfs_rq->load.weight; long prev_runtime; /* @@ -582,8 +571,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) load = rq_of(cfs_rq)->cpu_load[2]; - delta_fair = se->delta_fair_sleep; - /* * Fix up delta_fair with the effect of us running * during the whole sleep period: @@ -618,12 +605,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - se->delta_fair_sleep += delta_fair; - if (unlikely(abs(se->delta_fair_sleep) >= - sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se); - se->delta_fair_sleep = 0; - } + __enqueue_sleeper(cfs_rq, se, delta_fair); 
se->sleep_start_fair = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6c97259..9b1b0d4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -266,17 +266,6 @@ static ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_stat_granularity_ns", - .data = &sysctl_sched_stat_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_runtime_limit_ns", .data = &sysctl_sched_runtime_limit, .maxlen = sizeof(unsigned int), -- cgit v0.10.2 From a25707f3aef9cf68c341eba5960d580f364e4e6f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: remove precise CPU load CPU load calculations are statistical anyway, and there's little benefit from having it calculated on every scheduling event. So remove this code, it gets rid of a divide from the scheduler wakeup and context-switch fastpath. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index d4dabfc..25cc9b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1972,42 +1972,11 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; unsigned long total_load = this_rq->ls.load.weight; unsigned long this_load = total_load; - struct load_stat *ls = &this_rq->ls; int i, scale; this_rq->nr_load_updates++; - if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) - goto do_avg; - - /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq); - - fair_delta64 = ls->delta_fair + 1; - ls->delta_fair = 0; - - exec_delta64 = ls->delta_exec + 1; - ls->delta_exec = 0; - - sample_interval64 = this_rq->clock - ls->load_update_last; - ls->load_update_last = this_rq->clock; - - if ((s64)sample_interval64 < (s64)TICK_NSEC) - sample_interval64 = TICK_NSEC; - - if (exec_delta64 > sample_interval64) - exec_delta64 = sample_interval64; - - idle_delta64 = sample_interval64 - exec_delta64; - - tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); - tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); - - this_load = (unsigned long)tmp64; - -do_avg: /* Update our load: */ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2017,7 +1986,13 @@ do_avg: old_load = this_rq->cpu_load[i]; new_load = this_load; - + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (new_load > old_load) + new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } } @@ -6484,7 +6459,6 @@ static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) void __init sched_init(void) { - u64 now = sched_clock(); int highest_cpu = 0; int i, j; @@ -6509,8 +6483,6 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); #endif - rq->ls.load_update_last = now; - rq->ls.load_update_start = now; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index fd080f6..6b789da 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -145,8 +145,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", rq->ls.load.weight); - P(ls.delta_fair); - P(ls.delta_exec); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2138c40..105d57b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -94,16 +94,14 @@ enum { SCHED_FEAT_FAIR_SLEEPERS = 1, SCHED_FEAT_SLEEPER_AVG = 2, SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_PRECISE_CPU_LOAD = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, + SCHED_FEAT_START_DEBIT = 8, + SCHED_FEAT_SKIP_INITIAL = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_PRECISE_CPU_LOAD *1 | SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_SKIP_INITIAL *0; -- cgit v0.10.2 From 53df556e06d85245cf6aacedaba8e4da684859c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: remove precise CPU load calculations #2 continued removal of precise CPU load calculations. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 25cc9b2..f6a8106 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -173,8 +173,6 @@ struct rt_prio_array { struct load_stat { struct load_weight load; - u64 load_update_start, load_update_last; - unsigned long delta_fair, delta_exec, delta_stat; }; /* CFS-related fields in a runqueue */ @@ -793,15 +791,6 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, #define sched_class_highest (&rt_sched_class) -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - /* * Update delta_exec, delta_fair fields for rq. * @@ -817,31 +806,13 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) * This function is called /before/ updating rq->ls.load * and when switching tasks. */ -static void update_curr_load(struct rq *rq) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = rq->clock; - ls->delta_stat += rq->clock - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. 
- */ - if (ls->delta_stat) - __update_curr_load(rq, ls); -} - static inline void inc_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); update_load_add(&rq->ls.load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); update_load_sub(&rq->ls.load, p->se.load.weight); } @@ -1972,8 +1943,7 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - unsigned long total_load = this_rq->ls.load.weight; - unsigned long this_load = total_load; + unsigned long this_load = this_rq->ls.load.weight; int i, scale; this_rq->nr_load_updates++; -- cgit v0.10.2 From 62160e3f4a06d948ec89665d29f1173e551deedc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: track cfs_rq->curr on !group-scheduling too Noticed by Roman Zippel: use cfs_rq->curr in the !group-scheduling case too. Small micro-optimization and cleanup effect: text data bss dec hex filename 36269 3482 24 39775 9b5f sched.o.before 36177 3486 24 39687 9b07 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index f6a8106..3209e2c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -189,11 +189,11 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; struct rb_node *rb_load_balance_curr; -#ifdef CONFIG_FAIR_GROUP_SCHED /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; +#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 105d57b..335faf0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -111,51 +111,38 @@ extern struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - /* currently running entity (if any) on this cfs_rq */ static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) { return cfs_rq->curr; } -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - static inline void set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->curr = se; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* cpu runqueue to which this cfs_rq is attached */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { - return container_of(cfs_rq, struct rq, cfs); + return cfs_rq->rq; } -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) - if (unlikely(rq->curr->sched_class != &fair_sched_class)) - return NULL; +#else /* CONFIG_FAIR_GROUP_SCHED */ - return &rq->curr->se; +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); } #define entity_is_task(se) 1 -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline struct task_struct *task_of(struct sched_entity *se) -- cgit v0.10.2 From 
429d43bcc026b92b9dfaccd3577fec290f6a67ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: cleanup: simplify cfs_rq_curr() methods cleanup: simplify cfs_rq_curr() methods - now that the cfs_rq->curr pointer is unconditionally present, remove the wrappers. kernel/sched.o: text data bss dec hex filename 11784 224 2012 14020 36c4 sched.o.before 11784 224 2012 14020 36c4 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 335faf0..74d47e6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -111,18 +111,6 @@ extern struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -/* currently running entity (if any) on this cfs_rq */ -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - return cfs_rq->curr; -} - -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->curr = se; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -382,7 +370,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { - struct sched_entity *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; @@ -440,7 +428,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); /* * Update the key: @@ -511,7 +499,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) * Mark the end of the wait period if dequeueing a * waiting task: */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); } @@ -717,7 +705,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); - set_cfs_rq_curr(cfs_rq, se); + cfs_rq->curr = se; #ifdef CONFIG_SCHEDSTATS /* * Track our maximum slice length, if the CPU's load is at @@ -754,7 +742,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_stats_wait_start(cfs_rq, prev); - set_cfs_rq_curr(cfs_rq, NULL); + cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) @@ -1153,7 +1141,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; sched_info_queued(p); -- cgit v0.10.2 From e59c80c5bbc0d3d6b0772edb347ce2dd303121b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: simplify SCHED_FEAT_* code Peter Zijlstra suggested to simplify SCHED_FEAT_* checks via the sched_feat(x) macro. 
No code changed: text data bss dec hex filename 38895 3550 24 42469 a5e5 sched.o.before 38895 3550 24 42469 a5e5 sched.o.after Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 74d47e6..2488f6f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -105,6 +105,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_SKIP_INITIAL *0; +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + extern struct sched_class fair_sched_class; /************************************************************** @@ -541,14 +543,14 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) return; - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) + if (sched_feat(SLEEPER_LOAD_AVG)) load = rq_of(cfs_rq)->cpu_load[2]; /* * Fix up delta_fair with the effect of us running * during the whole sleep period: */ - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) + if (sched_feat(SLEEPER_AVG)) delta_fair = div64_likely32((u64)delta_fair * load, load + se->load.weight); @@ -572,7 +574,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) unsigned long delta_fair; if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) + !sched_feat(FAIR_SLEEPERS)) return; delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), @@ -1158,14 +1160,14 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: */ - if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) + if (sched_feat(SKIP_INITIAL)) se->wait_start_fair = 0; /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) + if (sched_feat(START_DEBIT)) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); -- cgit v0.10.2 From 19ccd97a03a026c2341b35af3ed2078a83c4a22b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: uninline __enqueue_entity()/__dequeue_entity() suggested by Roman Zippel: uninline __enqueue_entity() and __dequeue_entity(). 
this reduces code size: text data bss dec hex filename 25385 2386 16 27787 6c8b sched.o.before 25257 2386 16 27659 6c0b sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2488f6f..91a227b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -148,7 +148,7 @@ static inline struct task_struct *task_of(struct sched_entity *se) /* * Enqueue an entity into the rb-tree: */ -static inline void +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; @@ -191,7 +191,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static inline void +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) -- cgit v0.10.2 From 1091985b482fdd577a5c511059b9d7b4467bd15d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: speed up update_load_add/_sub() speed up update_load_add/_sub() by not delaying the division - this reduces CPU pipeline dependencies. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 3209e2c..992a1fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -697,16 +697,17 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); } -static void update_load_add(struct load_weight *lw, unsigned long inc) +static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - lw->inv_weight = 0; + lw->inv_weight = WMULT_CONST / lw->weight; } -static void update_load_sub(struct load_weight *lw, unsigned long dec) +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - lw->inv_weight = 0; + if (likely(lw->weight)) + lw->inv_weight = WMULT_CONST / lw->weight; } /* -- cgit v0.10.2 From 08e2388aa1e40cb06f7d04ac621e2ae94e1d8fdc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: clean up calc_weighted() clean up calc_weighted() - we always use the normalized shift so it's not needed to pass that in. Also, push the non-nice0 branch into the function. 
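viewed in isolation, the simplified helper just scales a delta by weight/NICE_0_LOAD and skips the multiply in the common nice-0 case. a standalone sketch of that behaviour, assuming the conventional NICE_0_LOAD/NICE_0_SHIFT pair of 1024/10 (the kernel version takes a sched_entity rather than a raw weight):

#include <stdio.h>

#define NICE_0_SHIFT	10
#define NICE_0_LOAD	(1UL << NICE_0_SHIFT)	/* assumed: 1024 */

static unsigned long calc_weighted(unsigned long delta, unsigned long weight)
{
	if (weight != NICE_0_LOAD)
		return (unsigned long)(((unsigned long long)delta * weight)
						>> NICE_0_SHIFT);
	return delta;	/* nice-0 fast path: delta * 1024 >> 10 == delta */
}

int main(void)
{
	printf("%lu\n", calc_weighted(1000000UL, 1024UL));	/* 1000000 */
	printf("%lu\n", calc_weighted(1000000UL, 2048UL));	/* 2000000 */
	printf("%lu\n", calc_weighted(1000000UL, 512UL));	/*  500000 */
	return 0;
}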
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 91a227b..b46f807 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -397,27 +397,16 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } -/* - * We calculate fair deltas here, so protect against the random effects - * of a multiplication overflow by capping it to the runtime limit: - */ -#if BITS_PER_LONG == 32 static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) +calc_weighted(unsigned long delta, struct sched_entity *se) { - u64 tmp = (u64)delta * weight >> shift; + unsigned long weight = se->load.weight; - if (unlikely(tmp > sysctl_sched_runtime_limit*2)) - return sysctl_sched_runtime_limit*2; - return tmp; + if (unlikely(weight != NICE_0_LOAD)) + return (u64)delta * se->load.weight >> NICE_0_SHIFT; + else + return delta; } -#else -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - return delta * weight >> shift; -} -#endif /* * Task is being enqueued - update stats: @@ -469,9 +458,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, schedstat_set(se->wait_max, max(se->wait_max, rq_of(cfs_rq)->clock - se->wait_start)); - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); + delta_fair = calc_weighted(delta_fair, se); add_wait_runtime(cfs_rq, se, delta_fair); } @@ -554,9 +541,7 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, delta_fair = div64_likely32((u64)delta_fair * load, load + se->load.weight); - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); + delta_fair = calc_weighted(delta_fair, se); prev_runtime = se->wait_runtime; __add_wait_runtime(cfs_rq, se, delta_fair); -- cgit v0.10.2 From e9acbff6484df51fd880e0f5fe0224e8be34c17b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: introduce se->vruntime introduce se->vruntime as a sum of weighted delta-exec's, and use that as the key into the tree. the idea to use absolute virtual time as the basic metric of scheduling has been first raised by William Lee Irwin, advanced by Tong Li and first prototyped by Roman Zippel in the "Really Fair Scheduler" (RFS) patchset. also see: http://lkml.org/lkml/2007/9/2/76 for a simpler variant of this patch. 
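the core of the new metric is simple: each entity accumulates execution time scaled by NICE_0_LOAD/weight, so heavier tasks age more slowly in virtual time. a minimal user-space sketch of that accounting - the weights are illustrative, and the kernel replaces the division shown here with a cached inverse-weight multiply via calc_delta_fair():

#include <stdio.h>

#define NICE_0_LOAD	1024ULL		/* assumed nice-0 weight */

struct entity {
	unsigned long long vruntime;	/* weighted execution time, ns */
	unsigned long weight;
};

/* charge delta_exec nanoseconds of real runtime to an entity */
static void update_vruntime(struct entity *se, unsigned long long delta_exec)
{
	unsigned long long weighted = delta_exec;

	if (se->weight != NICE_0_LOAD)
		weighted = delta_exec * NICE_0_LOAD / se->weight;
	se->vruntime += weighted;
}

int main(void)
{
	struct entity light = { 0, 512 }, heavy = { 0, 2048 };

	update_vruntime(&light, 1000000);	/* 1ms of real time each */
	update_vruntime(&heavy, 1000000);
	/* light=2000000, heavy=500000: the light task ages 4x faster,
	 * so the rbtree (keyed on vruntime) favours the heavy one */
	printf("light=%llu heavy=%llu\n", light.vruntime, heavy.vruntime);
	return 0;
}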
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c38a50..5e5c457 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -902,6 +902,7 @@ struct sched_entity { u64 exec_start; u64 sum_exec_runtime; + u64 vruntime; u64 prev_sum_exec_runtime; u64 wait_start_fair; u64 sleep_start_fair; diff --git a/kernel/sched.c b/kernel/sched.c index 992a1fa..8f80eba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -182,6 +182,7 @@ struct cfs_rq { s64 fair_clock; u64 exec_clock; + u64 min_vruntime; s64 wait_runtime; u64 sleeper_bonus; unsigned long wait_runtime_overruns, wait_runtime_underruns; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b46f807..a2af09c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -92,14 +92,16 @@ unsigned int sysctl_sched_runtime_limit __read_mostly; */ enum { SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_SLEEPER_AVG = 2, - SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_START_DEBIT = 8, - SCHED_FEAT_SKIP_INITIAL = 16, + SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, + SCHED_FEAT_SLEEPER_AVG = 4, + SCHED_FEAT_SLEEPER_LOAD_AVG = 8, + SCHED_FEAT_START_DEBIT = 16, + SCHED_FEAT_SKIP_INITIAL = 32, }; const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_FAIR_SLEEPERS *1 | + SCHED_FEAT_FAIR_SLEEPERS *0 | + SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | SCHED_FEAT_START_DEBIT *1 | @@ -145,6 +147,19 @@ static inline struct task_struct *task_of(struct sched_entity *se) * Scheduling class tree data structure manipulation methods: */ +static inline void +set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) +{ + struct sched_entity *se; + + cfs_rq->rb_leftmost = leftmost; + if (leftmost) { + se = rb_entry(leftmost, struct sched_entity, run_node); + cfs_rq->min_vruntime = max(se->vruntime, + cfs_rq->min_vruntime); + } +} + /* * Enqueue an entity into the rb-tree: */ @@ -180,7 +195,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * used): */ if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; + set_leftmost(cfs_rq, &se->run_node); rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -195,7 +210,8 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); + set_leftmost(cfs_rq, rb_next(&se->run_node)); + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; @@ -336,7 +352,7 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta, delta_fair, delta_mine; + unsigned long delta, delta_fair, delta_mine, delta_exec_weighted; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; @@ -344,6 +360,12 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; cfs_rq->exec_clock += delta_exec; + delta_exec_weighted = delta_exec; + if (unlikely(curr->load.weight != NICE_0_LOAD)) { + delta_exec_weighted = calc_delta_fair(delta_exec_weighted, + &curr->load); + } + curr->vruntime += delta_exec_weighted; if (unlikely(!load)) return; @@ -413,8 +435,6 @@ calc_weighted(unsigned long delta, struct sched_entity *se) */ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 key; - 
/* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) @@ -424,28 +444,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Update the key: */ - key = cfs_rq->fair_clock; - - /* - * Optimize the common nice 0 case: - */ - if (likely(se->load.weight == NICE_0_LOAD)) { - key -= se->wait_runtime; - } else { - u64 tmp; - - if (se->wait_runtime < 0) { - tmp = -se->wait_runtime; - key += (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } else { - tmp = se->wait_runtime; - key -= (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } - } - - se->fair_key = key; + se->fair_key = se->vruntime; } /* @@ -615,8 +614,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) */ update_curr(cfs_rq); - if (wakeup) + if (wakeup) { + u64 min_runtime, latency; + + min_runtime = cfs_rq->min_vruntime; + min_runtime += sysctl_sched_latency/2; + + if (sched_feat(NEW_FAIR_SLEEPERS)) { + latency = calc_weighted(sysctl_sched_latency, se); + if (min_runtime > latency) + min_runtime -= latency; + } + + se->vruntime = max(se->vruntime, min_runtime); + enqueue_sleeper(cfs_rq, se); + } update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); @@ -1155,6 +1168,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) if (sched_feat(START_DEBIT)) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); + se->vruntime = cfs_rq->min_vruntime; + update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); resched_task(rq->curr); } -- cgit v0.10.2 From bf5c91ba8c629b84413c761f529627195fd0a935 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: move sched_feat() definitions move sched_feat() definitions so that it can be used sooner by generic code too. 
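taken together, const_debug and the sched_feat() test form a small self-contained pattern: feature bits that are sysctl-tunable in debug builds and constant-foldable otherwise. a compilable sketch of the pair (simplified - the kernel variant uses __read_mostly resp. 'static const' rather than plain 'const'):

#include <stdio.h>

/* tunable when debugging, a foldable constant otherwise */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug
#else
# define const_debug const
#endif

enum {
	SCHED_FEAT_FAIR_SLEEPERS	= 1,
	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 2,
	SCHED_FEAT_START_DEBIT		= 16,
};

const_debug unsigned int sysctl_sched_features =
		SCHED_FEAT_FAIR_SLEEPERS *0 |
		SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
		SCHED_FEAT_START_DEBIT *1;

/* token pasting keeps the call sites terse */
#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)

int main(void)
{
	if (sched_feat(START_DEBIT))
		printf("START_DEBIT on\n");
	if (!sched_feat(FAIR_SLEEPERS))
		printf("FAIR_SLEEPERS off\n");
	return 0;
}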
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 8f80eba..a5dd035 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -382,6 +382,37 @@ static void update_rq_clock(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* + * Debugging: various feature bits + */ +enum { + SCHED_FEAT_FAIR_SLEEPERS = 1, + SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, + SCHED_FEAT_SLEEPER_AVG = 4, + SCHED_FEAT_SLEEPER_LOAD_AVG = 8, + SCHED_FEAT_START_DEBIT = 16, + SCHED_FEAT_SKIP_INITIAL = 32, +}; + +const_debug unsigned int sysctl_sched_features = + SCHED_FEAT_FAIR_SLEEPERS *0 | + SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | + SCHED_FEAT_SLEEPER_AVG *0 | + SCHED_FEAT_SLEEPER_LOAD_AVG *1 | + SCHED_FEAT_START_DEBIT *1 | + SCHED_FEAT_SKIP_INITIAL *0; + +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + +/* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a2af09c..a566a45 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,15 +21,6 @@ */ /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug static const -#endif - -/* * Targeted preemption latency for CPU-bound tasks: * (default: 20ms, units: nanoseconds) * @@ -87,28 +78,6 @@ const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; unsigned int sysctl_sched_runtime_limit __read_mostly; -/* - * Debugging: various feature bits - */ -enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, - SCHED_FEAT_SLEEPER_AVG = 4, - SCHED_FEAT_SLEEPER_LOAD_AVG = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, -}; - -const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_FAIR_SLEEPERS *0 | - SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *0 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; - -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) - extern struct sched_class fair_sched_class; /************************************************************** -- cgit v0.10.2 From 6cb58195143b55d4c427d92f8425bec2b0d9c56c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: optimize vruntime based scheduling optimize vruntime based scheduling. 
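what this makes conditional is the upkeep of load_weight.inv_weight, a cached fixed-point reciprocal that lets calc_delta_mine() replace a division with a multiply and shift. a rough standalone sketch of the trick - WMULT_CONST is taken as 2^32 here for clarity, and the kernel's overflow checks and rounding are omitted:

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT	32
#define WMULT_CONST	(1ULL << WMULT_SHIFT)	/* assumed fixed point base */

struct load_weight {
	unsigned long weight;
	uint64_t inv_weight;		/* ~ 2^32 / weight, cached */
};

static void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = WMULT_CONST / lw->weight;	/* one division here */
}

/* approximate delta * weight / lw->weight with no division */
static uint64_t calc_delta(uint64_t delta, unsigned long weight,
			   struct load_weight *lw)
{
	return (delta * weight * lw->inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	struct load_weight lw = { 0, 0 };

	update_load_add(&lw, 3072);	/* e.g. three nice-0 tasks */
	/* 1ms of nice-0 runtime against a 3072-weight queue:
	 * 1000000 * 1024 / 3072 ~= 333333 */
	printf("%llu\n", (unsigned long long)calc_delta(1000000, 1024, &lw));
	return 0;
}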
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index a5dd035..5594e65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -732,13 +732,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - lw->inv_weight = WMULT_CONST / lw->weight; + if (sched_feat(FAIR_SLEEPERS)) + lw->inv_weight = WMULT_CONST / lw->weight; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - if (likely(lw->weight)) + if (sched_feat(FAIR_SLEEPERS) && likely(lw->weight)) lw->inv_weight = WMULT_CONST / lw->weight; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a566a45..7041dc6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -336,6 +336,9 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, } curr->vruntime += delta_exec_weighted; + if (!sched_feat(FAIR_SLEEPERS)) + return; + if (unlikely(!load)) return; -- cgit v0.10.2 From 4d78e7b656aa6440c337302fe065338ce840a64e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: new task placement for vruntime add proper new task placement for the vruntime based math too. ( note: introduces a swap() macro, but the swap token is too widely used in the kernel namespace for a generic version to be added without changing non-scheduler code - so this cleanup will be done separately. ) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7041dc6..95487e3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -203,6 +203,20 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +static u64 __sched_period(unsigned long nr_running) +{ + u64 period = sysctl_sched_latency; + unsigned long nr_latency = + sysctl_sched_latency / sysctl_sched_min_granularity; + + if (unlikely(nr_running > nr_latency)) { + period *= nr_running; + do_div(period, nr_latency); + } + + return period; +} + /* * Calculate the preemption granularity needed to schedule every * runnable task once per sysctl_sched_latency amount of time. @@ -1103,6 +1117,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) } } +#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) + /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks @@ -1118,14 +1134,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) sched_info_queued(p); update_curr(cfs_rq); + se->vruntime = cfs_rq->min_vruntime; update_stats_enqueue(cfs_rq, se); - /* - * Child runs first: we let it run before the parent - * until it reschedules once. 
We set up the key so that - * it will preempt the parent: - */ - se->fair_key = curr->fair_key - - niced_granularity(curr, sched_granularity(cfs_rq)) - 1; + /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: @@ -1138,9 +1149,16 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sched_feat(START_DEBIT)) - se->wait_runtime = -(sched_granularity(cfs_rq) / 2); + se->wait_runtime = -(__sched_period(cfs_rq->nr_running+1) / 2); + + if (sysctl_sched_child_runs_first && + curr->vruntime < se->vruntime) { + + dequeue_entity(cfs_rq, curr, 0); + swap(curr->vruntime, se->vruntime); + enqueue_entity(cfs_rq, curr, 0); + } - se->vruntime = cfs_rq->min_vruntime; update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); resched_task(rq->curr); -- cgit v0.10.2 From 6d0f0ebd063e36cd0ebae9be15973b02c4245a99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: simplify adaptive latency simplify adaptive latency. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 95487e3..3179d11 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -217,77 +217,14 @@ static u64 __sched_period(unsigned long nr_running) return period; } -/* - * Calculate the preemption granularity needed to schedule every - * runnable task once per sysctl_sched_latency amount of time. - * (down to a sensible low limit on granularity) - * - * For example, if there are 2 tasks running and latency is 10 msecs, - * we switch tasks every 5 msecs. If we have 3 tasks running, we have - * to switch tasks every 3.33 msecs to get a 10 msecs observed latency - * for each task. We do finer and finer scheduling up to until we - * reach the minimum granularity value. - * - * To achieve this we use the following dynamic-granularity rule: - * - * gran = lat/nr - lat/nr/nr - * - * This comes out of the following equations: - * - * kA1 + gran = kB1 - * kB2 + gran = kA2 - * kA2 = kA1 - * kB2 = kB1 - d + d/nr - * lat = d * nr - * - * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), - * '1' is start of time, '2' is end of time, 'd' is delay between - * 1 and 2 (during which task B was running), 'nr' is number of tasks - * running, 'lat' is the the period of each task. ('lat' is the - * sched_latency that we aim for.) 
- */ -static long -sched_granularity(struct cfs_rq *cfs_rq) +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned int gran = sysctl_sched_latency; - unsigned int nr = cfs_rq->nr_running; - - if (nr > 1) { - gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_min_granularity); - } + u64 period = __sched_period(cfs_rq->nr_running); - return gran; -} + period *= se->load.weight; + do_div(period, cfs_rq->load.weight); -/* - * We rescale the rescheduling granularity of tasks according to their - * nice level, but only linearly, not exponentially: - */ -static long -niced_granularity(struct sched_entity *curr, unsigned long granularity) -{ - u64 tmp; - - if (likely(curr->load.weight == NICE_0_LOAD)) - return granularity; - /* - * Positive nice levels get the same granularity as nice-0: - */ - if (likely(curr->load.weight < NICE_0_LOAD)) { - tmp = curr->load.weight * (u64)granularity; - return (long) (tmp >> NICE_0_SHIFT); - } - /* - * Negative nice level tasks get linearly finer - * granularity: - */ - tmp = curr->load.inv_weight * (u64)granularity; - - /* - * It will always fit into 'long': - */ - return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); + return period; } static inline void @@ -646,36 +583,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) */ static void __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr, unsigned long granularity) + struct sched_entity *curr) { - s64 __delta = curr->fair_key - se->fair_key; unsigned long ideal_runtime, delta_exec; - /* - * ideal_runtime is compared against sum_exec_runtime, which is - * walltime, hence do not scale. - */ - ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, - (unsigned long)sysctl_sched_min_granularity); - - /* - * If we executed more than what the latency constraint suggests, - * reduce the rescheduling granularity. This way the total latency - * of how much a task is not scheduled converges to - * sysctl_sched_latency: - */ + ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) - granularity = 0; - - /* - * Take scheduling granularity into account - do not - * preempt the current task unless the best task has - * a larger than sched_granularity fairness advantage: - * - * scale granularity as key space is in fair_clock. 
- */ - if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); } @@ -749,8 +663,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + __check_preempt_curr_fair(cfs_rq, next, curr); } /************************************************** @@ -944,7 +857,6 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - unsigned long gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -953,15 +865,8 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) return; } - gran = sysctl_sched_wakeup_granularity; - /* - * Batch tasks prefer throughput over latency: - */ - if (unlikely(p->policy == SCHED_BATCH)) - gran = sysctl_sched_batch_wakeup_granularity; - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); + __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v0.10.2 From 5c6b5964a0629bd39fbf4e5648a8aca32de5bcaf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: simplify check_preempt() methods simplify the check_preempt() methods. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3179d11..45c7493 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -582,8 +582,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * Preempt the current task with a newly woken task if needed: */ static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr) +__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; @@ -663,7 +662,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr); + __check_preempt_curr_fair(cfs_rq, curr); } /************************************************** @@ -866,7 +865,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) } if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se); + __check_preempt_curr_fair(cfs_rq, &curr->se); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v0.10.2 From 2e09bf556fbe1a4cd8d837a3e6607de55f7cf4fd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: wakeup granularity increase increase wakeup granularity - we were overscheduling a bit. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 45c7493..a60b1da 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -74,7 +74,7 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; unsigned int sysctl_sched_runtime_limit __read_mostly; @@ -582,7 +582,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * Preempt the current task with a newly woken task if needed: */ static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *curr) +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; @@ -646,8 +646,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *next; - /* * Dequeue and enqueue the task to update its * position within the tree: @@ -655,14 +653,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) dequeue_entity(cfs_rq, curr, 0); enqueue_entity(cfs_rq, curr, 0); - /* - * Reschedule if another task tops the current one. - */ - next = __pick_next_entity(cfs_rq); - if (next == curr) - return; - - __check_preempt_curr_fair(cfs_rq, curr); + if (cfs_rq->nr_running > 1) + check_preempt_tick(cfs_rq, curr); } /************************************************** @@ -852,7 +844,7 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -863,9 +855,12 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + if (is_same_group(curr, p)) { + s64 delta = curr->se.vruntime - p->se.vruntime; - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &curr->se); + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1095,7 +1090,7 @@ struct sched_class fair_sched_class __read_mostly = { .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, - .check_preempt_curr = check_preempt_curr_fair, + .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, -- cgit v0.10.2 From aeb73b040399f94698b4f64dd058cae39187e18d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: clean up new task placement clean up new task placement. 
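The placement policy introduced below is compact enough to model in user space. A minimal sketch, assuming plain u64 arithmetic and assuming the START_DEBIT and NEW_FAIR_SLEEPERS features are enabled; place_vruntime() and its parameters are illustrative stand-ins, and the kernel additionally clamps the result with max(se->vruntime, ...):

#include <stdio.h>

typedef unsigned long long u64;

/*
 * A new or woken entity lands at the midpoint between the queue's
 * min_vruntime and the rightmost (latest) vruntime; a fresh fork is
 * debited half a latency period (START_DEBIT), while a waker gets up
 * to one latency period of sleeper credit (NEW_FAIR_SLEEPERS).
 */
static u64 place_vruntime(u64 min_vr, u64 last_vr, int have_last,
                          int initial, u64 latency)
{
        u64 v = min_vr;

        if (have_last) {
                v = (min_vr + last_vr) >> 1;    /* midpoint of the spread */
                if (initial)
                        v += latency / 2;       /* START_DEBIT */
        }
        if (!initial)
                v = (v > latency) ? v - latency : 0;    /* sleeper credit */

        return v;
}

int main(void)
{
        /* a fresh fork on a queue spanning [1ms, 3ms], 20ms latency */
        printf("%llu\n", place_vruntime(1000000ULL, 3000000ULL, 1, 1,
                                        20000000ULL));
        return 0;
}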
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a60b1da..cc447fb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -199,6 +199,21 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); } +static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *se = NULL; + struct rb_node *parent; + + while (*link) { + parent = *link; + se = rb_entry(parent, struct sched_entity, run_node); + link = &parent->rb_right; + } + + return se; +} + /************************************************************** * Scheduling class statistics methods: */ @@ -530,6 +545,31 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +{ + struct sched_entity *last = __pick_last_entity(cfs_rq); + u64 min_runtime, latency; + + min_runtime = cfs_rq->min_vruntime; + if (last) { + min_runtime += last->vruntime; + min_runtime >>= 1; + if (initial && sched_feat(START_DEBIT)) + min_runtime += sysctl_sched_latency/2; + } + + if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { + latency = sysctl_sched_latency; + if (min_runtime > latency) + min_runtime -= latency; + else + min_runtime = 0; + } + + se->vruntime = max(se->vruntime, min_runtime); +} + +static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* @@ -538,19 +578,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) update_curr(cfs_rq); if (wakeup) { - u64 min_runtime, latency; - - min_runtime = cfs_rq->min_vruntime; - min_runtime += sysctl_sched_latency/2; - - if (sched_feat(NEW_FAIR_SLEEPERS)) { - latency = calc_weighted(sysctl_sched_latency, se); - if (min_runtime > latency) - min_runtime -= latency; - } - - se->vruntime = max(se->vruntime, min_runtime); - + place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -1033,8 +1061,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) sched_info_queued(p); update_curr(cfs_rq); - se->vruntime = cfs_rq->min_vruntime; - update_stats_enqueue(cfs_rq, se); + place_entity(cfs_rq, se, 1); /* * The first wait is dominated by the child-runs-first logic, -- cgit v0.10.2 From 67e12eac328b276dca7e61640632ed996ff1a93a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: add se->vruntime debugging debug se->vruntime fields. 
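The derived numbers the new debug output prints are easy to reproduce by hand; a toy computation with made-up, ns-scale vruntimes (plain C, nothing kernel-specific):

#include <stdio.h>

typedef long long s64;

int main(void)
{
        /* runnable vruntimes, sorted as the rb-tree keeps them */
        s64 vr[3] = { 1003000, 1010000, 1042000 };

        s64 MIN_vruntime = vr[0];                       /* leftmost node */
        s64 max_vruntime = vr[2];                       /* rightmost node */
        s64 spread = max_vruntime - MIN_vruntime;       /* fairness window */

        printf("MIN_vruntime %lld max_vruntime %lld spread %lld\n",
               MIN_vruntime, max_vruntime, spread);
        return 0;
}

A small, stable spread is the healthy case; a steadily growing spread would point at unfairness between the runnable tasks.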
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6b789da..75ccf7a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -44,7 +44,8 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", + SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld %15Ld\n", + (long long)p->se.vruntime, (long long)p->se.sum_exec_runtime, (long long)p->se.sum_wait_runtime, (long long)p->se.sum_sleep_runtime, @@ -64,10 +65,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "\nrunnable tasks:\n" " task PID tree-key delta waiting" " switches prio" - " sum-exec sum-wait sum-sleep" + " exec-runtime sum-exec sum-wait sum-sleep" " wait-overrun wait-underrun\n" "------------------------------------------------------------------" - "----------------" + "--------------------------------" "------------------------------------------------" "--------------------------------\n"); @@ -108,6 +109,11 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { + s64 MIN_vruntime = -1, max_vruntime = -1, spread; + struct rq *rq = &per_cpu(runqueues, cpu); + struct sched_entity *last; + unsigned long flags; + SEQ_printf(m, "\ncfs_rq\n"); #define P(x) \ @@ -115,6 +121,23 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) P(fair_clock); P(exec_clock); + P(min_vruntime); + + spin_lock_irqsave(&rq->lock, flags); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld\n", "MIN_vruntime", + (long long)MIN_vruntime); + SEQ_printf(m, " .%-30s: %Ld\n", "max_vruntime", + (long long)max_vruntime); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld\n", "spread", + (long long)spread); + P(wait_runtime); P(wait_runtime_overruns); P(wait_runtime_underruns); @@ -243,6 +266,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.wait_start_fair); P(se.exec_start); P(se.sleep_start_fair); + P(se.vruntime); P(se.sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS -- cgit v0.10.2 From 28a1f6fa2f7ecec7e5da28b03a24abbecbd2e864 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: remove SCHED_FEAT_SKIP_INITIAL remove SCHED_FEAT_SKIP_INITIAL - it was off by default and even when enabled it never made any real difference. 
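For context, the feature flags touched here are bits in a single mask; a stand-alone sketch of the *1/*0 initializer idiom and the sched_feat() test, keeping two of the flags with their values from the patch (the reduced enum is otherwise illustrative):

#include <stdio.h>

enum {
        SCHED_FEAT_NEW_FAIR_SLEEPERS    = 2,
        SCHED_FEAT_START_DEBIT          = 16,
};

/*
 * Multiplying each flag by 1 or 0 keeps every feature, enabled or
 * not, visible in one initializer:
 */
static const unsigned int sysctl_sched_features =
                SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
                SCHED_FEAT_START_DEBIT *1;

#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)

int main(void)
{
        printf("START_DEBIT: %s\n", sched_feat(START_DEBIT) ? "on" : "off");
        return 0;
}

Removing a dead feature therefore only means deleting its enum entry and its line in the initializer, as the diff below does.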
Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 5594e65..bf85b4b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,7 +399,6 @@ enum { SCHED_FEAT_SLEEPER_AVG = 4, SCHED_FEAT_SLEEPER_LOAD_AVG = 8, SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, }; const_debug unsigned int sysctl_sched_features = @@ -407,8 +406,7 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; + SCHED_FEAT_START_DEBIT *1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cc447fb..c8c6b05 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1064,13 +1064,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 1); /* - * The first wait is dominated by the child-runs-first logic, - * so do not credit it with that waiting time yet: - */ - if (sched_feat(SKIP_INITIAL)) - se->wait_start_fair = 0; - - /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ -- cgit v0.10.2 From 94dfb5e75ef59068a8cf68fa6e18f25ebdcd20b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: add tree based averages add support for tree based vruntime averages. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index bf85b4b..198b07a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,6 +399,8 @@ enum { SCHED_FEAT_SLEEPER_AVG = 4, SCHED_FEAT_SLEEPER_LOAD_AVG = 8, SCHED_FEAT_START_DEBIT = 16, + SCHED_FEAT_USE_TREE_AVG = 32, + SCHED_FEAT_APPROX_AVG = 64, }; const_debug unsigned int sysctl_sched_features = @@ -406,7 +408,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_START_DEBIT *1; + SCHED_FEAT_START_DEBIT *1 | + SCHED_FEAT_USE_TREE_AVG *0 | + SCHED_FEAT_APPROX_AVG *0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c8c6b05..86e5e8c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -547,16 +547,22 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - struct sched_entity *last = __pick_last_entity(cfs_rq); u64 min_runtime, latency; min_runtime = cfs_rq->min_vruntime; - if (last) { - min_runtime += last->vruntime; - min_runtime >>= 1; - if (initial && sched_feat(START_DEBIT)) - min_runtime += sysctl_sched_latency/2; - } + + if (sched_feat(USE_TREE_AVG)) { + struct sched_entity *last = __pick_last_entity(cfs_rq); + if (last) { + min_runtime = __pick_next_entity(cfs_rq)->vruntime; + min_runtime += last->vruntime; + min_runtime >>= 1; + } + } else if (sched_feat(APPROX_AVG)) + min_runtime += sysctl_sched_latency/2; + + if (initial && sched_feat(START_DEBIT)) + min_runtime += sched_slice(cfs_rq, se); if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { latency = sysctl_sched_latency; -- cgit v0.10.2 From 9014623c0e3545be58a7f19f55793f6517bdc274 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: handle vruntime 64-bit overflow Handle vruntime 
overflow by centering the key space around min_vruntime. ( otherwise we could overflow 64-bit vruntime in a few days with SCHED_IDLE tasks - or in a few years with nice +19. ) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 86e5e8c..895fef7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -124,11 +124,18 @@ set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) cfs_rq->rb_leftmost = leftmost; if (leftmost) { se = rb_entry(leftmost, struct sched_entity, run_node); - cfs_rq->min_vruntime = max(se->vruntime, - cfs_rq->min_vruntime); + if ((se->vruntime > cfs_rq->min_vruntime) || + (cfs_rq->min_vruntime > (1ULL << 61) && + se->vruntime < (1ULL << 50))) + cfs_rq->min_vruntime = se->vruntime; } } +s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->fair_key - cfs_rq->min_vruntime; +} + /* * Enqueue an entity into the rb-tree: */ @@ -138,7 +145,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = se->fair_key; + s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -151,7 +158,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * We dont care about collisions. Nodes with * the same key stay together. */ - if (key - entry->fair_key < 0) { + if (key < entity_key(cfs_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; -- cgit v0.10.2 From 86d9560cb6bd85986e98b4c63705daec94406bd4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: add more vruntime statistics add more vruntime statistics. 
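Of the new statistics, spread0 is the least obvious: it is the offset of this runqueue's min_vruntime from CPU 0's. A toy sketch with made-up sample values (the spread0() helper is purely illustrative; the debug code computes it inline):

#include <stdio.h>

typedef long long s64;
typedef unsigned long long u64;

/*
 * Near zero when the per-CPU key spaces agree; a large |spread0|
 * hints at cross-CPU vruntime offset problems after migrations.
 */
static s64 spread0(u64 min_vruntime, u64 rq0_min_vruntime)
{
        return (s64)(min_vruntime - rq0_min_vruntime);
}

int main(void)
{
        printf("spread0 %lld\n", spread0(5000000ULL, 4700000ULL)); /* 300000 */
        return 0;
}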
Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 75ccf7a..7a61706 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -109,7 +109,8 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, max_vruntime = -1, spread; + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; struct rq *rq = &per_cpu(runqueues, cpu); struct sched_entity *last; unsigned long flags; @@ -121,7 +122,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) P(fair_clock); P(exec_clock); - P(min_vruntime); spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) @@ -129,14 +129,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; + min_vruntime = rq->cfs.min_vruntime; + rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld\n", "MIN_vruntime", (long long)MIN_vruntime); + SEQ_printf(m, " .%-30s: %Ld\n", "min_vruntime", + (long long)min_vruntime); SEQ_printf(m, " .%-30s: %Ld\n", "max_vruntime", (long long)max_vruntime); spread = max_vruntime - MIN_vruntime; SEQ_printf(m, " .%-30s: %Ld\n", "spread", (long long)spread); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld\n", "spread0", + (long long)spread0); P(wait_runtime); P(wait_runtime_overruns); -- cgit v0.10.2 From 7a62eabc4d60980eb39fff659f168d903b55c6d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: debug: update exec_clock only when SCHED_DEBUG micro-optimization: update cfs_rq->exec_clock only if CONFIG_SCHED_DEBUG=y. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 895fef7..ce79eb0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -301,7 +301,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = delta_exec; if (unlikely(curr->load.weight != NICE_0_LOAD)) { delta_exec_weighted = calc_delta_fair(delta_exec_weighted, -- cgit v0.10.2 From 495eca494aa6006df55e3a04e105462c5940ca17 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: clean up struct load_stat 'struct load_stat' is redundant now so let's get rid of it. 
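With the wrapper gone, rq->load is a bare struct load_weight maintained through the usual helpers; a reduced sketch of those helpers, assuming only the weight field matters for the per-rq total (matching the simplified versions that appear later in this series):

struct load_weight {
        unsigned long weight;
        unsigned long inv_weight;
};

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
        lw->weight += inc;      /* a task entered the runqueue */
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
        lw->weight -= dec;      /* a task left the runqueue */
}

weighted_cpuload() then collapses to reading cpu_rq(cpu)->load.weight directly, as the diff below shows.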
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 198b07a..3a4ac0b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -171,10 +171,6 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -struct load_stat { - struct load_weight load; -}; - /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -236,7 +232,7 @@ struct rq { #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct load_stat ls; /* capture load from *all* tasks on this cpu */ + struct load_weight load; /* capture load from *all* tasks on this cpu */ unsigned long nr_load_updates; u64 nr_switches; @@ -831,7 +827,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * Update delta_exec, delta_fair fields for rq. * * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while + * total load (rq->load.weight) on the runqueue, while * delta_exec advances at the same rate as wall-clock (provided * cpu is not idle). * @@ -839,17 +835,17 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * runqueue over any given interval. This (smoothened) load is used * during load balance. * - * This function is called /before/ updating rq->ls.load + * This function is called /before/ updating rq->load * and when switching tasks. */ static inline void inc_load(struct rq *rq, const struct task_struct *p) { - update_load_add(&rq->ls.load, p->se.load.weight); + update_load_add(&rq->load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p) { - update_load_sub(&rq->ls.load, p->se.load.weight); + update_load_sub(&rq->load, p->se.load.weight); } static void inc_nr_running(struct task_struct *p, struct rq *rq) @@ -996,7 +992,7 @@ inline int task_curr(const struct task_struct *p) /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->ls.load.weight; + return cpu_rq(cpu)->load.weight; } static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) @@ -1979,7 +1975,7 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - unsigned long this_load = this_rq->ls.load.weight; + unsigned long this_load = this_rq->load.weight; int i, scale; this_rq->nr_load_updates++; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7a61706..62965f0 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -174,7 +174,7 @@ static void print_cpu(struct seq_file *m, int cpu) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->ls.load.weight); + rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ce79eb0..72f202a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -652,7 +652,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->ls.load.weight >= 2*se->load.weight) { + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->slice_max = max(se->slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } -- cgit v0.10.2 From e22f5bbf86d8cce710d5c8ba5bf57832e73aab8c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: remove wait_runtime limit remove the wait_runtime-limit fields and the code depending on it, now that the math has been changed over to rely on the vruntime metric. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e5c457..353630d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -905,7 +905,6 @@ struct sched_entity { u64 vruntime; u64 prev_sum_exec_runtime; u64 wait_start_fair; - u64 sleep_start_fair; #ifdef CONFIG_SCHEDSTATS u64 wait_start; diff --git a/kernel/sched.c b/kernel/sched.c index 3a4ac0b..21cc3b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -180,7 +180,6 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; s64 wait_runtime; - u64 sleeper_bonus; unsigned long wait_runtime_overruns, wait_runtime_underruns; struct rb_root tasks_timeline; @@ -673,19 +672,6 @@ static inline void resched_task(struct task_struct *p) } #endif -static u64 div64_likely32(u64 divident, unsigned long divisor) -{ -#if BITS_PER_LONG == 32 - if (likely(divident <= 0xffffffffULL)) - return (u32)divident / divisor; - do_div(divident, divisor); - - return divident; -#else - return divident / divisor; -#endif -} - #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) #else @@ -1016,8 +1002,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.wait_start_fair) p->se.wait_start_fair -= fair_clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1592,7 +1576,6 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.wait_runtime = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -6582,7 +6565,6 @@ void normalize_rt_tasks(void) p->se.wait_runtime = 0; p->se.exec_start = 0; p->se.wait_start_fair = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; p->se.sleep_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 62965f0..3350169 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -148,7 +148,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) P(wait_runtime); P(wait_runtime_overruns); P(wait_runtime_underruns); - P(sleeper_bonus); #undef P print_cfs_rq_runtime_sum(m, cpu, cfs_rq); @@ -272,7 +271,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.wait_runtime); P(se.wait_start_fair); P(se.exec_start); - P(se.sleep_start_fair); P(se.vruntime); P(se.sum_exec_runtime); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 72f202a..a94189c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -249,41 +249,11 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return period; } -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - long limit = sysctl_sched_runtime_limit; - - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if 
(unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } -} - -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); -} - static void add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) { - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + se->wait_runtime += delta; + schedstat_add(cfs_rq, wait_runtime, delta); } /* @@ -294,7 +264,7 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta, delta_fair, delta_mine, delta_exec_weighted; + unsigned long delta_fair, delta_mine, delta_exec_weighted; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; @@ -318,14 +288,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { - delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); - delta = min(delta, (unsigned long)( - (long)sysctl_sched_runtime_limit - curr->wait_runtime)); - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; - } - cfs_rq->fair_clock += delta_fair; /* * We executed delta_exec amount of time on the CPU, @@ -461,58 +423,8 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long delta_fair) -{ - unsigned long load = cfs_rq->load.weight; - long prev_runtime; - - /* - * Do not boost sleepers if there's too much bonus 'in flight' - * already: - */ - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - return; - - if (sched_feat(SLEEPER_LOAD_AVG)) - load = rq_of(cfs_rq)->cpu_load[2]; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep period: - */ - if (sched_feat(SLEEPER_AVG)) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - delta_fair = calc_weighted(delta_fair, se); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; - - /* - * Track the amount of bonus we've given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; -} - static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !sched_feat(FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - __enqueue_sleeper(cfs_rq, se, delta_fair); - - se->sleep_start_fair = 0; - #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; @@ -544,6 +456,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * time that the task spent sleeping: */ if (unlikely(prof_on == 
SLEEP_PROFILING)) { + struct task_struct *tsk = task_of(se); + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } @@ -604,7 +518,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9b1b0d4..97b15c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -266,17 +266,6 @@ static ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_runtime_limit_ns", - .data = &sysctl_sched_runtime_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), -- cgit v0.10.2 From bbdba7c0e1161934ae881ad00e4db49830f5ef59 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: remove wait_runtime fields and features remove wait_runtime based fields and features, now that the CFS math has been changed over to the vruntime metric. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 353630d..572df1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -888,13 +888,9 @@ struct load_weight { * 4 se->block_start * 4 se->run_node * 4 se->sleep_start - * 4 se->sleep_start_fair * 6 se->load.weight - * 7 se->delta_fair - * 15 se->wait_runtime */ struct sched_entity { - long wait_runtime; s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -904,12 +900,10 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; - u64 wait_start_fair; #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; - s64 sum_wait_runtime; u64 sleep_start; u64 sleep_max; @@ -919,9 +913,6 @@ struct sched_entity { u64 block_max; u64 exec_max; u64 slice_max; - - unsigned long wait_runtime_overruns; - unsigned long wait_runtime_underruns; #endif #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched.c b/kernel/sched.c index 21cc3b2..0f0cf37 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -176,11 +176,8 @@ struct cfs_rq { struct load_weight load; unsigned long nr_running; - s64 fair_clock; u64 exec_clock; u64 min_vruntime; - s64 wait_runtime; - unsigned long wait_runtime_overruns, wait_runtime_underruns; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; @@ -389,20 +386,14 @@ static void update_rq_clock(struct rq *rq) * Debugging: various feature bits */ enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, - SCHED_FEAT_SLEEPER_AVG = 4, - SCHED_FEAT_SLEEPER_LOAD_AVG = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_USE_TREE_AVG = 32, - SCHED_FEAT_APPROX_AVG = 64, + SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, + SCHED_FEAT_START_DEBIT = 2, + SCHED_FEAT_USE_TREE_AVG = 4, + SCHED_FEAT_APPROX_AVG = 8, }; const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_FAIR_SLEEPERS *0 | SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *0 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_USE_TREE_AVG *0 | SCHED_FEAT_APPROX_AVG *0; @@ -716,15 +707,11 @@ 
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - if (sched_feat(FAIR_SLEEPERS)) - lw->inv_weight = WMULT_CONST / lw->weight; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - if (sched_feat(FAIR_SLEEPERS) && likely(lw->weight)) - lw->inv_weight = WMULT_CONST / lw->weight; } /* @@ -848,8 +835,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) static void set_load_weight(struct task_struct *p) { - p->se.wait_runtime = 0; - if (task_has_rt_policy(p)) { p->se.load.weight = prio_to_weight[0] * 2; p->se.load.inv_weight = prio_to_wmult[0] >> 1; @@ -995,13 +980,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - u64 clock_offset, fair_clock_offset; + u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; - - if (p->se.wait_start_fair) - p->se.wait_start_fair -= fair_clock_offset; #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1571,15 +1552,12 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { - p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.wait_runtime = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; p->se.block_start = 0; @@ -1588,8 +1566,6 @@ static void __sched_fork(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; #endif INIT_LIST_HEAD(&p->run_list); @@ -6436,7 +6412,6 @@ int in_sched_functions(unsigned long addr) static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; - cfs_rq->fair_clock = 1; #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif @@ -6562,15 +6537,12 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { p->se.fair_key = 0; - p->se.wait_runtime = 0; p->se.exec_start = 0; - p->se.wait_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; if (!rt_task(p)) { diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3350169..e3b6232 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -36,21 +36,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %15Ld %13Ld %5d ", p->comm, p->pid, (long long)p->se.fair_key, - (long long)(p->se.fair_key - rq->cfs.fair_clock), - (long long)p->se.wait_runtime, (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld %15Ld\n", + SEQ_printf(m, "%15Ld %15Ld %15Ld\n", (long long)p->se.vruntime, (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_wait_runtime, - (long long)p->se.sum_sleep_runtime, - (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); + (long long)p->se.sum_sleep_runtime); #else SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 0LL, 0LL, 0LL, 0LL, 0LL); @@ -63,10 +58,8 @@ static void 
print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key delta waiting" - " switches prio" - " exec-runtime sum-exec sum-wait sum-sleep" - " wait-overrun wait-underrun\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" "------------------------------------------------------------------" "--------------------------------" "------------------------------------------------" @@ -84,29 +77,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irq(&tasklist_lock); } -static void -print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ - s64 wait_runtime_rq_sum = 0; - struct task_struct *p; - struct rb_node *curr; - unsigned long flags; - struct rq *rq = &per_cpu(runqueues, cpu); - - spin_lock_irqsave(&rq->lock, flags); - curr = first_fair(cfs_rq); - while (curr) { - p = rb_entry(curr, struct task_struct, se.run_node); - wait_runtime_rq_sum += p->se.wait_runtime; - - curr = rb_next(curr); - } - spin_unlock_irqrestore(&rq->lock, flags); - - SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", - (long long)wait_runtime_rq_sum); -} - void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -120,7 +90,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - P(fair_clock); P(exec_clock); spin_lock_irqsave(&rq->lock, flags); @@ -144,13 +113,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld\n", "spread0", (long long)spread0); - - P(wait_runtime); - P(wait_runtime_overruns); - P(wait_runtime_underruns); #undef P - - print_cfs_rq_runtime_sum(m, cpu, cfs_rq); } static void print_cpu(struct seq_file *m, int cpu) @@ -268,8 +231,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) - P(se.wait_runtime); - P(se.wait_start_fair); P(se.exec_start); P(se.vruntime); P(se.sum_exec_runtime); @@ -283,9 +244,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.exec_max); P(se.slice_max); P(se.wait_max); - P(se.wait_runtime_overruns); - P(se.wait_runtime_underruns); - P(se.sum_wait_runtime); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -312,8 +270,6 @@ void proc_sched_set_task(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a94189c..2df5a64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -178,8 +178,6 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static void @@ -192,8 +190,6 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; - - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -249,13 +245,6 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return 
period; } -static void -add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(cfs_rq, wait_runtime, delta); -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -264,9 +253,7 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta_fair, delta_mine, delta_exec_weighted; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; + unsigned long delta_exec_weighted; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -278,25 +265,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; - - if (!sched_feat(FAIR_SLEEPERS)) - return; - - if (unlikely(!load)) - return; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - cfs_rq->fair_clock += delta_fair; - /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: - */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); } static void update_curr(struct cfs_rq *cfs_rq) @@ -322,7 +290,6 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->wait_start_fair = cfs_rq->fair_clock; schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } @@ -354,35 +321,11 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) se->fair_key = se->vruntime; } -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long delta_fair) -{ - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - - delta_fair = calc_weighted(delta_fair, se); - - add_wait_runtime(cfs_rq, se, delta_fair); -} - static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long delta_fair; - - if (unlikely(!se->wait_start_fair)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - __update_stats_wait_end(cfs_rq, se, delta_fair); - - se->wait_start_fair = 0; + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); schedstat_set(se->wait_start, 0); } @@ -552,9 +495,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Any task has to be enqueued before it get to execute on * a CPU. So account for the time it spent waiting on the - * runqueue. (note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) + * runqueue. 
*/ update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); @@ -989,13 +930,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) update_curr(cfs_rq); place_entity(cfs_rq, se, 1); - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sched_feat(START_DEBIT)) - se->wait_runtime = -(__sched_period(cfs_rq->nr_running+1) / 2); - if (sysctl_sched_child_runs_first && curr->vruntime < se->vruntime) { -- cgit v0.10.2 From db36cc7d6d9e538481e60fae7f56646b92557526 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: clean up schedstat block in dequeue_entity() Better placement of #ifdef CONFIG_SCHEDSTAT block in dequeue_entity(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2df5a64..e3081fb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -460,8 +460,8 @@ static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); - if (sleep) { #ifdef CONFIG_SCHEDSTATS + if (sleep) { if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -470,8 +470,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } -#endif } +#endif __dequeue_entity(cfs_rq, se); } -- cgit v0.10.2 From 35a6ff5417bf94c9e19b6b55a9eb6eea14cc7be7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: x86: allow single-depth wchan output sched.o gets smaller and faster if we compile it with -fomit-frame-pointers, so make this a config option. The cost is the loss of multi-depth wchan lookups - but SysRq-T is a sufficient replacement for them anyway, so their utility is much lower these days. the size difference is significant: text data bss dec hex filename 34005 3462 24 37491 9273 sched.o.before 33470 3462 24 36956 905c sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f1486f8..bf9aafa 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -214,6 +214,17 @@ config X86_ES7000 endchoice +config SCHED_NO_NO_OMIT_FRAME_POINTER + bool "Single-depth WCHAN output" + default y + help + Calculate simpler /proc//wchan values. If this option + is disabled then wchan values will recurse back to the + caller function. This provides more accurate wchan values, + at the expense of slightly more scheduling overhead. + + If in doubt, say "Y". + config PARAVIRT bool "Paravirtualization support (EXPERIMENTAL)" depends on EXPERIMENTAL -- cgit v0.10.2 From 02e0431a3db554019b816936b597d618256b705d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: better min_vruntime tracking Better min_vruntime tracking: update it every time 'curr' is updated - not just when a task is enqueued into the tree. 
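The wrap-aware comparison that keeps min_vruntime monotonic can be exercised on its own; a user-space sketch whose helper body and magic thresholds are copied from the patch below:

#include <stdio.h>

typedef unsigned long long u64;

static u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
        /*
         * Take the larger value, but treat a tiny vruntime as having
         * wrapped past a huge min_vruntime:
         */
        if ((vruntime > min_vruntime) ||
            (min_vruntime > (1ULL << 61) && vruntime < (1ULL << 50)))
                min_vruntime = vruntime;

        return min_vruntime;
}

int main(void)
{
        u64 near_wrap = ~0ULL - 100;    /* just below 2^64 */

        printf("%llu\n", max_vruntime(100, 200));       /* 200 */
        printf("%llu\n", max_vruntime(near_wrap, 5));   /* 5: wrapped value wins */
        return 0;
}

In this wrap-aware sense min_vruntime only ever moves forward, which is what keeps the relative keys in the tree well behaved.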
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e3081fb..ec445ca 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -116,22 +116,28 @@ static inline struct task_struct *task_of(struct sched_entity *se) * Scheduling class tree data structure manipulation methods: */ +static inline u64 +max_vruntime(u64 min_vruntime, u64 vruntime) +{ + if ((vruntime > min_vruntime) || + (min_vruntime > (1ULL << 61) && vruntime < (1ULL << 50))) + min_vruntime = vruntime; + + return min_vruntime; +} + static inline void set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) { struct sched_entity *se; cfs_rq->rb_leftmost = leftmost; - if (leftmost) { + if (leftmost) se = rb_entry(leftmost, struct sched_entity, run_node); - if ((se->vruntime > cfs_rq->min_vruntime) || - (cfs_rq->min_vruntime > (1ULL << 61) && - se->vruntime < (1ULL << 50))) - cfs_rq->min_vruntime = se->vruntime; - } } -s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline s64 +entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->fair_key - cfs_rq->min_vruntime; } @@ -254,6 +260,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; + u64 next_vruntime, min_vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -265,6 +272,25 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; + + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. + */ + if (first_fair(cfs_rq)) { + next_vruntime = __pick_next_entity(cfs_rq)->vruntime; + + /* min_vruntime() := !max_vruntime() */ + min_vruntime = max_vruntime(curr->vruntime, next_vruntime); + if (min_vruntime == next_vruntime) + min_vruntime = curr->vruntime; + else + min_vruntime = next_vruntime; + } else + min_vruntime = curr->vruntime; + + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, min_vruntime); } static void update_curr(struct cfs_rq *cfs_rq) -- cgit v0.10.2 From 119fe5e06800afc197781ebc8c2d8ca7d03497c8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: fix SMP migration latencies fix SMP migration latencies: the vruntimes of different CPUs are at incompatible offsets so they have to be fixed up when migrating a task across CPUs. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 0f0cf37..4ad789d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -992,6 +992,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.block_start) p->se.block_start -= clock_offset; #endif + if (likely(new_rq->cfs.min_vruntime)) + p->se.vruntime -= old_rq->cfs.min_vruntime - + new_rq->cfs.min_vruntime; __set_task_cpu(p, new_cpu); } -- cgit v0.10.2 From 29f59db3a74b0bdf78a1f5b53ef773caa82692dc Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: group-scheduler core Add interface to control cpu bandwidth allocation to task-groups. 
(not yet configurable, due to missing CONFIG_CONTAINERS) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/init/Kconfig b/init/Kconfig index d54d0ca..11c6762 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -281,6 +281,15 @@ config CPUSETS Say N if unsure. +config FAIR_GROUP_SCHED + bool "Fair group scheduler" + depends on EXPERIMENTAL && CONTAINERS + help + This option enables you to group tasks and control CPU resource + allocation to such groups. + + Say N if unsure. + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y diff --git a/kernel/sched.c b/kernel/sched.c index 4ad789d..b2688ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -171,6 +171,58 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +#ifdef CONFIG_FAIR_GROUP_SCHED + +#include + +struct cfs_rq; + +/* task group related information */ +struct task_grp { + struct container_subsys_state css; + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; +}; + +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; + +static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS]; + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +static struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, + }; + +/* return group to which a task belongs */ +static inline struct task_grp *task_grp(struct task_struct *p) +{ + return container_of(task_subsys_state(p, cpu_subsys_id), + struct task_grp, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_cfs_rq(struct task_struct *p) +{ + p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)]; + p->se.parent = task_grp(p)->se[task_cpu(p)]; +} + +#else + +static inline void set_task_cfs_rq(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -197,6 +249,7 @@ struct cfs_rq { * list is used during load balance. */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? 
*/ + struct task_grp *tg; /* group that "owns" this runqueue */ #endif }; @@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu) return now; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Change a task's ->cfs_rq if it moves across CPUs */ -static inline void set_task_cfs_rq(struct task_struct *p) -{ - p->se.cfs_rq = &task_rq(p)->cfs; -} -#else -static inline void set_task_cfs_rq(struct task_struct *p) -{ -} -#endif - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { #ifdef CONFIG_SMP task_thread_info(p)->cpu = cpu; - set_task_cfs_rq(p); #endif + set_task_cfs_rq(p); } #ifdef CONFIG_SMP @@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + if (task_running(rq, p)) + p->sched_class->put_prev_task(rq, p); + } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); + p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -4190,8 +4235,11 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { deactivate_task(rq, p, 0); + if (task_running(rq, p)) + p->sched_class->put_prev_task(rq, p); + } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { @@ -4204,6 +4252,7 @@ recheck: if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); + p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -6444,7 +6493,25 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = &init_task_grp; + list_add(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + init_task_grp.shares = NICE_0_LOAD; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p) } #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* return corresponding task_grp object of a container */ +static inline struct task_grp *container_tg(struct container *cont) +{ + return container_of(container_subsys_state(cont, cpu_subsys_id), + struct task_grp, css); +} + +/* allocate runqueue etc for a new task group */ +static struct container_subsys_state * +sched_create_group(struct container_subsys *ss, struct container *cont) +{ + struct task_grp *tg; + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + if (!cont->parent) { + /* This is early initialization for the top container */ + init_task_grp.css.container = cont; + return &init_task_grp.css; + } + + /* we support only 1-level deep hierarchical scheduler atm */ + if (cont->parent->parent) + return ERR_PTR(-EINVAL); + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = 
kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL); + if (!tg->se) + goto err; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, + cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, + cpu_to_node(i)); + if (!se) + goto err; + + memset(cfs_rq, 0, sizeof(struct cfs_rq)); + memset(se, 0, sizeof(struct sched_entity)); + + tg->cfs_rq[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + + tg->shares = NICE_0_LOAD; + + /* Bind the container to task_grp object we just created */ + tg->css.container = cont; + + return &tg->css; + +err: + for_each_possible_cpu(i) { + if (tg->cfs_rq && tg->cfs_rq[i]) + kfree(tg->cfs_rq[i]); + if (tg->se && tg->se[i]) + kfree(tg->se[i]); + } + if (tg->cfs_rq) + kfree(tg->cfs_rq); + if (tg->se) + kfree(tg->se); + if (tg) + kfree(tg); + + return ERR_PTR(-ENOMEM); +} + + +/* destroy runqueue etc associated with a task group */ +static void sched_destroy_group(struct container_subsys *ss, + struct container *cont) +{ + struct task_grp *tg = container_tg(cont); + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + /* wait for possible concurrent references to cfs_rqs complete */ + synchronize_sched(); + + /* now it should be safe to free those cfs_rqs */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + kfree(cfs_rq); + + se = tg->se[i]; + kfree(se); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); +} + +static int sched_can_attach(struct container_subsys *ss, + struct container *cont, struct task_struct *tsk) +{ + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; + + return 0; +} + +/* change task's runqueue when it moves between groups */ +static void sched_move_task(struct container_subsys *ss, struct container *cont, + struct container *old_cont, struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + if (tsk->sched_class != &fair_sched_class) + goto done; + + update_rq_clock(rq); + + running = task_running(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) { + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + } + + set_task_cfs_rq(tsk); + + if (on_rq) { + enqueue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + } + +done: + task_rq_unlock(rq, &flags); +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + int on_rq; + + spin_lock_irq(&rq->lock); + + on_rq = se->on_rq; + if (on_rq) + dequeue_entity(cfs_rq, se, 0); + + se->load.weight = shares; + se->load.inv_weight = div64_64((1ULL<<32), shares); + + if (on_rq) + enqueue_entity(cfs_rq, se, 0); + + spin_unlock_irq(&rq->lock); +} + +static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype, + struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) +{ + int i; + unsigned long shareval; + struct task_grp *tg = container_tg(cont); + char buffer[2*sizeof(unsigned 
long) + 1]; + + if (nbytes > 2*sizeof(unsigned long)) /* safety check */ + return -E2BIG; + + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + shareval = simple_strtoul(buffer, NULL, 10); + + tg->shares = shareval; + for_each_possible_cpu(i) + set_se_shares(tg->se[i], shareval); + + return nbytes; +} + +static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft) +{ + struct task_grp *tg = container_tg(cont); + + return (u64) tg->shares; +} + +struct cftype cpuctl_share = { + .name = "shares", + .read_uint = cpu_shares_read_uint, + .write = cpu_shares_write, +}; + +static int sched_populate(struct container_subsys *ss, struct container *cont) +{ + return container_add_file(cont, ss, &cpuctl_share); +} + +struct container_subsys cpu_subsys = { + .name = "cpu", + .create = sched_create_group, + .destroy = sched_destroy_group, + .can_attach = sched_can_attach, + .attach = sched_move_task, + .populate = sched_populate, + .subsys_id = cpu_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec445ca..12ab933 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -610,8 +610,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) */ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) { - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; + return cfs_rq->tg->cfs_rq[this_cpu]; } /* Iterate thr' all leaf cfs_rq's on a runqueue */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2..5ebf829 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) { } +static void set_curr_task_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = { .load_balance = load_balance_idle, + .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4b87476..45b339f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -218,6 +218,10 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } +static void set_curr_task_rt(struct rq *rq) +{ +} + static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -230,5 +234,6 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, + .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; -- cgit v0.10.2 From d02e5ed8d55e2a2b2735232ea1da40ffbf4c0932 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: sched_setscheduler() fix Fix a problem in the 'sched-group' patch for !CONFIG_FAIR_GROUP_SCHED. Description: sched_setscheduler() { ... if (task_running()) p->sched_class->put_prev_task(); [ this one sets up cfs_rq->curr to NULL ] ... if (task_running()) p->sched_class->set_curr_task(); [ and this one is a _NOP_ (empty) for !CONFIG_FAIR_GROUP_SCHED ] As a result, the task continues to run with cfs_rq->curr == NULL... no crashes (due to checks for !NULL in place) but e.g. update_curr() effectively becomes a NOP... i.e. runtime statistics for this task are not accounted until it's rescheduled anew.
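The failure mode is easy to model outside the kernel; a condensed sketch with stand-in types (the real update_curr() and set_curr_task_fair() operate on far richer structures):

#include <stdio.h>
#include <stddef.h>

struct sched_entity { unsigned long long sum_exec_runtime; };
struct cfs_rq { struct sched_entity *curr; };

/* model of update_curr(): a NULL curr silently accounts nothing */
static void update_curr(struct cfs_rq *cfs_rq, unsigned long long delta)
{
        if (!cfs_rq->curr)
                return;                 /* the bug: statistics freeze here */
        cfs_rq->curr->sum_exec_runtime += delta;
}

/* the fix: the !FAIR_GROUP_SCHED set_curr_task_fair() must
 * re-establish curr instead of being empty */
static void set_curr_task_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        cfs_rq->curr = se;
}

int main(void)
{
        struct sched_entity se = { 0 };
        struct cfs_rq cfs_rq = { NULL };        /* state after put_prev_task() */

        update_curr(&cfs_rq, 1000);             /* lost */
        set_curr_task_fair(&cfs_rq, &se);
        update_curr(&cfs_rq, 1000);             /* accounted */
        printf("%llu\n", se.sum_exec_runtime);  /* prints 1000, not 2000 */
        return 0;
}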
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 12ab933..144f3ef 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -984,6 +984,10 @@ static void set_curr_task_fair(struct rq *rq) #else static void set_curr_task_fair(struct rq *rq) { + struct sched_entity *se = &rq->curr->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->curr = se; } #endif -- cgit v0.10.2 From 7074badbcb4212d404a243e5c50efeb778ec3fc6 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: add set_curr_task() calls p->sched_class->set_curr_task() has to be called before activate_task()/enqueue_task() in rt_mutex_setprio(), sched_setschedule() and sched_move_task() in order to set up 'cfs_rq->curr'. The logic of enqueueing depends on whether a task to be inserted is 'current' or not. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index b2688ce..6d18921 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3915,8 +3915,8 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { + int oldprio, on_rq, running; unsigned long flags; - int oldprio, on_rq; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3926,9 +3926,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; + running = task_running(rq, p); if (on_rq) { dequeue_task(rq, p, 0); - if (task_running(rq, p)) + if (running) p->sched_class->put_prev_task(rq, p); } @@ -3940,16 +3941,17 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); - p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -4153,7 +4155,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; struct rq *rq; @@ -4235,24 +4237,26 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; + running = task_running(rq, p); if (on_rq) { deactivate_task(rq, p, 0); - if (task_running(rq, p)) + if (running) p->sched_class->put_prev_task(rq, p); } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); - p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -6861,9 +6865,9 @@ static void sched_move_task(struct container_subsys *ss, struct container *cont, set_task_cfs_rq(tsk); if (on_rq) { - enqueue_task(rq, tsk, 0); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); + 
enqueue_task(rq, tsk, 0); } done: -- cgit v0.10.2 From 30cfdcfc5f180fc21a3dad6ae3b7b2a9ee112186 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: do not keep current in the tree and get rid of sched_entity::fair_key Get rid of 'sched_entity::fair_key'. As a side effect, 'current' is not kept withing the tree for SCHED_NORMAL/BATCH tasks anymore. This simplifies some parts of code (e.g. entity_tick() and yield_task_fair()) and also somewhat optimizes them (e.g. a single update_curr() now vs. dequeue/enqueue() before in entity_tick()). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 572df1b..f776a30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -891,7 +891,6 @@ struct load_weight { * 6 se->load.weight */ struct sched_entity { - s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; diff --git a/kernel/sched.c b/kernel/sched.c index 6d18921..3b10463 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,7 +6610,6 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { - p->se.fair_key = 0; p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e3b6232..bb34b81 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -38,7 +38,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15s %5d %15Ld %13Ld %5d ", p->comm, p->pid, - (long long)p->se.fair_key, + (long long)p->se.vruntime, (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 144f3ef..b9e426a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -139,7 +139,7 @@ set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return se->fair_key - cfs_rq->min_vruntime; + return se->vruntime - cfs_rq->min_vruntime; } /* @@ -181,9 +181,6 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -193,9 +190,6 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) set_leftmost(cfs_rq, rb_next(&se->run_node)); rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -341,10 +335,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); - /* - * Update the key: - */ - se->fair_key = se->vruntime; } static void @@ -392,6 +382,22 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} + +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; +} + 
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -479,7 +485,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); } static void @@ -498,7 +506,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) } } #endif - __dequeue_entity(cfs_rq, se); + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); } /* @@ -544,6 +554,10 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + /* 'current' is not kept within the tree. */ + if (se) + __dequeue_entity(cfs_rq, se); + set_next_entity(cfs_rq, se); return se; @@ -560,19 +574,20 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_curr_end(cfs_rq, prev); - if (prev->on_rq) + if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ - dequeue_entity(cfs_rq, curr, 0); - enqueue_entity(cfs_rq, curr, 0); + update_curr(cfs_rq); if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); @@ -749,7 +764,7 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) /* * Minimally necessary key value to be last in the tree: */ - se->fair_key = rightmost->fair_key + 1; + se->vruntime = rightmost->vruntime + 1; if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); @@ -965,6 +980,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); resched_task(rq->curr); } -- cgit v0.10.2 From 75d4ef16a6aa84f708188bada182315f80aab6fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: fix delay accounting performance regression fix delay accounting performance regression - those sched_clock() calls are not needed. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c20a94d..1d9ec98 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given * the cpu. 
We should note that with the exception of interactive @@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHEDSTATS */ -- cgit v0.10.2 From 87fefa381ef27f46c1182622ea01eb9504cd2e24 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: optimize task_new_fair() due to the fact that we no longer keep the 'current' within the tree, dequeue/enqueue_entity() is useless for the 'current' in task_new_fair(). We are about to reschedule and sched_class->put_prev_task() will put the 'current' back into the tree, based on its new key. text data bss dec hex filename 24388 2734 20 27142 6a06 sched.o.before 24341 2734 20 27095 69d7 sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b9e426a..827a063 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -972,10 +972,11 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) if (sysctl_sched_child_runs_first && curr->vruntime < se->vruntime) { - - dequeue_entity(cfs_rq, curr, 0); + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ swap(curr->vruntime, se->vruntime); - enqueue_entity(cfs_rq, curr, 0); } update_stats_enqueue(cfs_rq, se); -- cgit v0.10.2 From 4530d7ab0fb8d5056b68c376949e2d5c4db7817e Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: simplify sched_class::yield_task() the 'p' (task_struct) parameter in the sched_class :: yield_task() is redundant as the caller is always the 'current'. Get rid of it. text data bss dec hex filename 24341 2734 20 27095 69d7 sched.o.before 24330 2734 20 27084 69cc sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index f776a30..6616900 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -858,7 +858,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); - void (*yield_task) (struct rq *rq, struct task_struct *p); + void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 3b10463..e1f784f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4537,7 +4537,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq); /* * Since we are going to call schedule() anyway, there's diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 827a063..4dd256d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -722,11 +722,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) * * If compat_yield is turned on then we requeue to the end of the tree. 
*/ -static void yield_task_fair(struct rq *rq, struct task_struct *p) +static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = &rq->cfs; struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *rightmost, *se = &p->se; + struct sched_entity *rightmost, *se = &rq->curr->se; struct rb_node *parent; /* @@ -741,8 +741,8 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + dequeue_entity(cfs_rq, se, 0); + enqueue_entity(cfs_rq, se, 0); return; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 45b339f..b86944c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p) } static void -yield_task_rt(struct rq *rq, struct task_struct *p) +yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, p); + requeue_task_rt(rq, rq->curr); } /* -- cgit v0.10.2 From f6b53205e17c8ca481c69ed579a35a650a4b481a Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: rework enqueue/dequeue_entity() to get rid of set_curr_task() rework enqueue/dequeue_entity() to get rid of sched_class::set_curr_task(). This simplifies sched_setscheduler(), rt_mutex_setprio() and sched_move_tasks(). text data bss dec hex filename 24330 2734 20 27084 69cc sched.o.before 24233 2730 20 26983 6967 sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 6616900..abcb027 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -871,7 +871,6 @@ struct sched_class { struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio); - void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); }; diff --git a/kernel/sched.c b/kernel/sched.c index e1f784f..72c936d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3915,8 +3915,8 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running; unsigned long flags; + int oldprio, on_rq; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3926,12 +3926,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; - running = task_running(rq, p); - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3941,15 +3937,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4155,7 +4149,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, 
oldprio, oldpolicy = -1, on_rq; unsigned long flags; struct rq *rq; @@ -4237,24 +4231,20 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - running = task_running(rq, p); - if (on_rq) { + if (on_rq) deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -6855,19 +6845,13 @@ static void sched_move_task(struct container_subsys *ss, struct container *cont, running = task_running(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - } set_task_cfs_rq(tsk); - if (on_rq) { - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); + if (on_rq) enqueue_task(rq, tsk, 0); - } done: task_rq_unlock(rq, &flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4dd256d..568e922 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -472,9 +472,20 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup, int set_curr) { /* + * In case of the 'current'. + */ + if (unlikely(set_curr)) { + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; + account_entity_enqueue(cfs_rq, se); + return; + } + + /* * Update the fair clock. */ update_curr(cfs_rq); @@ -485,8 +496,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } update_stats_enqueue(cfs_rq, se); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); + __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); } @@ -506,8 +516,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) } } #endif - if (se != cfs_rq->curr) + if (likely(se != cfs_rq->curr)) __dequeue_entity(cfs_rq, se); + else { + update_stats_curr_end(cfs_rq, se); + cfs_rq->curr = NULL; + } account_entity_dequeue(cfs_rq, se); } @@ -689,12 +703,17 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int set_curr = 0; + + /* Are we enqueuing the current task? */ + if (unlikely(task_running(rq, p))) + set_curr = 1; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); + enqueue_entity(cfs_rq, se, wakeup, set_curr); } } @@ -742,7 +761,7 @@ static void yield_task_fair(struct rq *rq) * position within the tree: */ dequeue_entity(cfs_rq, se, 0); - enqueue_entity(cfs_rq, se, 0); + enqueue_entity(cfs_rq, se, 0, 1); return; } @@ -985,29 +1004,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. 
- */ -static void set_curr_task_fair(struct rq *rq) -{ - struct sched_entity *se = &rq->curr->se; - - for_each_sched_entity(se) - set_next_entity(cfs_rq_of(se), se); -} -#else -static void set_curr_task_fair(struct rq *rq) -{ - struct sched_entity *se = &rq->curr->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - cfs_rq->curr = se; -} -#endif - /* * All the scheduling class methods: */ @@ -1023,7 +1019,6 @@ struct sched_class fair_sched_class __read_mostly = { .load_balance = load_balance_fair, - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, }; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5ebf829..3503fb2 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -50,10 +50,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) { } -static void set_curr_task_idle(struct rq *rq) -{ -} - /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -70,7 +66,6 @@ static struct sched_class idle_sched_class __read_mostly = { .load_balance = load_balance_idle, - .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b86944c..3c77c03 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -218,10 +218,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } -static void set_curr_task_rt(struct rq *rq) -{ -} - static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -234,6 +230,5 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; -- cgit v0.10.2 From 1a75b94f7bda591f4c53af86baa50e1eaee35927 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: prettify /proc/sched_debug output print the correct amount of dashes in /proc/sched_debug. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index bb34b81..22cf74c 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -60,10 +60,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "\nrunnable tasks:\n" " task PID tree-key switches prio" " exec-runtime sum-exec sum-sleep\n" - "------------------------------------------------------------------" - "--------------------------------" - "------------------------------------------------" - "--------------------------------\n"); + "------------------------------------------------------" + "------------------------------------------------"); read_lock_irq(&tasklist_lock); -- cgit v0.10.2 From ef83a5714d9a817b2e9b97f04a6d070fbd6ecf80 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: enhance debug output enhance debug output by changing 12345678 nsecs to 12.345678 output, this is more human-readable. 
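A rough user-space equivalent of the nsec_high()/nsec_low() split introduced by this patch; plain 64-bit division stands in for do_div(), which is an assumption of this sketch rather than the kernel code:

#include <stdio.h>

/* millisecond part of a signed nanosecond value, sign preserved */
static long long nsec_high(long long nsec)
{
	long long sign = nsec < 0 ? -1 : 1;

	if (nsec < 0)
		nsec = -nsec;
	return sign * (nsec / 1000000);
}

/* sub-millisecond remainder in nanoseconds; printed with %06 it forms
 * the six fractional digits of the millisecond value */
static unsigned long nsec_low(long long nsec)
{
	if (nsec < 0)
		nsec = -nsec;
	return (unsigned long)(nsec % 1000000);
}

int main(void)
{
	long long t = 12345678;	/* 12345678 ns */

	printf("%lld.%06lu\n", nsec_high(t), nsec_low(t));	/* 12.345678 */
	return 0;
}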
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 22cf74c..e2c1e0d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -28,6 +28,31 @@ printk(x); \ } while (0) +/* + * Ease the printing of nsec fields: + */ +static long long nsec_high(long long nsec) +{ + if (nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static unsigned long nsec_low(long long nsec) +{ + if (nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -36,19 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, p->pid, - (long long)p->se.vruntime, + SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld\n", - (long long)p->se.vruntime, - (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_sleep_runtime); + SEQ_printf(m, "%15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - 0LL, 0LL, 0LL, 0LL, 0LL); + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif } @@ -85,10 +110,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, "\ncfs_rq\n"); -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - - P(exec_clock); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) @@ -99,19 +122,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) min_vruntime = rq->cfs.min_vruntime; rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); - SEQ_printf(m, " .%-30s: %Ld\n", "MIN_vruntime", - (long long)MIN_vruntime); - SEQ_printf(m, " .%-30s: %Ld\n", "min_vruntime", - (long long)min_vruntime); - SEQ_printf(m, " .%-30s: %Ld\n", "max_vruntime", - (long long)max_vruntime); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld\n", "spread", - (long long)spread); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld\n", "spread0", - (long long)spread0); -#undef P + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); } static void print_cpu(struct seq_file *m, int cpu) @@ -131,6 +153,8 @@ static void print_cpu(struct seq_file *m, int cpu) #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", @@ -139,21 +163,22 @@ static void print_cpu(struct seq_file *m, int cpu) P(nr_load_updates); P(nr_uninterruptible); SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - 
P(next_balance); + PN(next_balance); P(curr->pid); - P(clock); - P(idle_clock); - P(prev_clock_raw); + PN(clock); + PN(idle_clock); + PN(prev_clock_raw); P(clock_warps); P(clock_overflows); P(clock_deep_idle_events); - P(clock_max_delta); + PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); #undef P +#undef PN print_cfs_stats(m, cpu); @@ -170,7 +195,7 @@ static int sched_debug_show(struct seq_file *m, void *v) (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); + SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -228,20 +253,22 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "----------------------------------------------\n"); #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) +#define PN(F) \ + SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - P(se.exec_start); - P(se.vruntime); - P(se.sum_exec_runtime); + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - P(se.wait_start); - P(se.sleep_start); - P(se.block_start); - P(se.sleep_max); - P(se.block_max); - P(se.exec_max); - P(se.slice_max); - P(se.wait_max); + PN(se.wait_start); + PN(se.sleep_start); + PN(se.block_start); + PN(se.sleep_max); + PN(se.block_max); + PN(se.exec_max); + PN(se.slice_max); + PN(se.wait_max); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -249,6 +276,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(policy); P(prio); #undef P +#undef PN { u64 t0, t1; -- cgit v0.10.2 From c86da3a3d40f6e7a032edfaea191fb51e9626c8f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: fix formatting of /proc/sched_debug fix formatting of /proc/sched_debug Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e2c1e0d..4eaaf96 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -67,7 +67,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.sum_sleep_runtime)); @@ -83,10 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" "------------------------------------------------------" - "------------------------------------------------"); + "----------------------------------------------------\n"); read_lock_irq(&tasklist_lock); -- cgit v0.10.2 From edcb60a309769a5f6e7c9e76d7c98b34d1757448 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: kernel/sched_fair.c whitespace cleanups some trivial whitespace cleanups. 
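The P()/PN() helpers used throughout the sched_debug patches above rely on preprocessor stringizing to print a field's name next to its value. A stand-alone sketch of the idea; the struct, its fields and the explicit rq parameter are illustrative only, since the kernel macros pick the variable up implicitly from the enclosing function:

#include <stdio.h>

struct rq_stats {
	long long nr_running;
	long long clock;	/* nanoseconds */
};

/* print "name: value", using #x to stringify the field name */
#define P(rq, x) \
	printf("  .%-30s: %lld\n", #x, (long long)(rq)->x)
/* same, but split nanoseconds into a msec.usec style value
 * (positive values only in this sketch) */
#define PN(rq, x) \
	printf("  .%-30s: %lld.%06lld\n", #x, \
	       (long long)(rq)->x / 1000000, (long long)(rq)->x % 1000000)

int main(void)
{
	struct rq_stats rq = { .nr_running = 3, .clock = 12345678 };

	P(&rq, nr_running);	/*  .nr_running : 3         */
	PN(&rq, clock);		/*  .clock      : 12.345678 */
	return 0;
}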
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 568e922..9f93a5c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -476,8 +476,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup, int set_curr) { /* - * In case of the 'current'. - */ + * In case of the 'current'. + */ if (unlikely(set_curr)) { update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; @@ -992,9 +992,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) if (sysctl_sched_child_runs_first && curr->vruntime < se->vruntime) { /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ swap(curr->vruntime, se->vruntime); } -- cgit v0.10.2 From 83b699ed20f5218580a1b7042064082e2e05f8c5 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: revert recent removal of set_curr_task() Revert removal of set_curr_task. Use put_prev_task/set_curr_task when changing groups/policies Signed-off-by: Srivatsa Vaddagiri < vatsa@linux.vnet.ibm.com> Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/include/linux/sched.h b/include/linux/sched.h index abcb027..6616900 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -871,6 +871,7 @@ struct sched_class { struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio); + void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); }; diff --git a/kernel/sched.c b/kernel/sched.c index 72c936d..ee7ac71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3916,7 +3916,7 @@ EXPORT_SYMBOL(sleep_on_timeout); void rt_mutex_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq; + int oldprio, on_rq, running; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3926,8 +3926,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3937,13 +3941,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4149,7 +4155,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; struct rq *rq; @@ -4231,20 +4237,26 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } oldprio = p->prio; __setscheduler(rq, p, 
policy, param->sched_priority); if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -6845,13 +6857,19 @@ static void sched_move_task(struct container_subsys *ss, struct container *cont, running = task_running(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + } set_task_cfs_rq(tsk); - if (on_rq) + if (on_rq) { + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); enqueue_task(rq, tsk, 0); + } done: task_rq_unlock(rq, &flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f93a5c..92563cd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -472,20 +472,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int wakeup, int set_curr) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* - * In case of the 'current'. - */ - if (unlikely(set_curr)) { - update_stats_curr_start(cfs_rq, se); - cfs_rq->curr = se; - account_entity_enqueue(cfs_rq, se); - return; - } - - /* * Update the fair clock. */ update_curr(cfs_rq); @@ -496,7 +485,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); } @@ -516,12 +506,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) } } #endif - if (likely(se != cfs_rq->curr)) + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); - else { - update_stats_curr_end(cfs_rq, se); - cfs_rq->curr = NULL; - } account_entity_dequeue(cfs_rq, se); } @@ -539,15 +525,20 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) resched_task(rq_of(cfs_rq)->curr); } -static inline void +static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. - */ - update_stats_wait_end(cfs_rq, se); + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + } + update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; #ifdef CONFIG_SCHEDSTATS @@ -568,10 +559,6 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - /* 'current' is not kept within the tree. */ - if (se) - __dequeue_entity(cfs_rq, se); - set_next_entity(cfs_rq, se); return se; @@ -703,17 +690,12 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int set_curr = 0; - - /* Are we enqueuing the current task? 
*/ - if (unlikely(task_running(rq, p))) - set_curr = 1; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup, set_curr); + enqueue_entity(cfs_rq, se, wakeup); } } @@ -761,7 +743,7 @@ static void yield_task_fair(struct rq *rq) * position within the tree: */ dequeue_entity(cfs_rq, se, 0); - enqueue_entity(cfs_rq, se, 0, 1); + enqueue_entity(cfs_rq, se, 0); return; } @@ -1004,6 +986,19 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +/* Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_curr_task_fair(struct rq *rq) +{ + struct sched_entity *se = &rq->curr->se; + + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); +} + /* * All the scheduling class methods: */ @@ -1019,6 +1014,7 @@ struct sched_class fair_sched_class __read_mostly = { .load_balance = load_balance_fair, + .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, }; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2..5ebf829 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) { } +static void set_curr_task_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = { .load_balance = load_balance_idle, + .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3c77c03..e1d5f1c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -218,6 +218,13 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } +static void set_curr_task_rt(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock; +} + static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -230,5 +237,6 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, + .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; -- cgit v0.10.2 From 72ea22f8fbc893425faefa60641f45a4cdef2261 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: fix minor bug in yield - fix a minor bug in yield (seen for CONFIG_FAIR_GROUP_SCHED), group scheduling would skew when yield was called. 
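The distinction matters because, with group scheduling, the entity being requeued sits on its group's cfs_rq rather than on the per-CPU root queue. A toy sketch of the lookup, using simplified stand-in types rather than the kernel structures:

#include <stdio.h>

struct cfs_rq { const char *name; };
struct sched_entity { struct cfs_rq *cfs_rq; /* queue this entity is on */ };
struct task_struct { struct sched_entity se; };
struct rq { struct cfs_rq cfs; /* root queue */ struct task_struct *curr; };

/* the runqueue the task is actually queued on (its group's, not the root) */
static struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

int main(void)
{
	struct rq rq = { .cfs = { "root cfs_rq" } };
	struct cfs_rq group_rq = { "group cfs_rq" };
	struct task_struct task = { .se = { .cfs_rq = &group_rq } };

	rq.curr = &task;
	/* the buggy form operated on the root queue ... */
	printf("&rq->cfs          -> %s\n", rq.cfs.name);
	/* ... while the yielding entity lives on its group's queue */
	printf("task_cfs_rq(curr) -> %s\n", task_cfs_rq(rq.curr)->name);
	return 0;
}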
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 92563cd..d8d2e2f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -725,7 +725,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) */ static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = &rq->cfs; + struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct sched_entity *rightmost, *se = &rq->curr->se; struct rb_node *parent; -- cgit v0.10.2 From 545f3b18152355acbb8da59873506fcf66c7c60e Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: print nr_running and load in /proc/sched_debug - print nr_running and load information for cfs_rq in /proc/sched_debug Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4eaaf96..3e47e87 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -134,6 +134,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); } static void print_cpu(struct seq_file *m, int cpu) -- cgit v0.10.2 From 75c28ace9f2b2f403674e045939424a77c95b47c Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: print &rq->cfs stats - Print &rq->cfs statistics as well (useful for group scheduling) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d8d2e2f..556942c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1024,6 +1024,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; +#ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); +#endif for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); } -- cgit v0.10.2 From 9b5b77512dce239fa168183fa71896712232e95a Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: clean up code under CONFIG_FAIR_GROUP_SCHED With the view of supporting user-id based fair scheduling (and not just container-based fair scheduling), this patch renames several functions and makes them independent of whether they are being used for container or user-id based fair scheduling. Also fix a problem reported by KAMEZAWA Hiroyuki (wrt allocating less-sized array for tg->cfs_rq[] and tf->se[]). 
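The allocation problem mentioned above comes from sizing the per-CPU pointer arrays by the number of possible CPUs while indexing them by CPU id, which can exceed that count when the possible map is sparse. A small stand-alone illustration; the CPU ids and the NR_CPUS value are made up for the example:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 8

int main(void)
{
	/* possible CPUs { 0, 2, 5 }: three CPUs, but the highest id is 5 */
	int possible_cpus[] = { 0, 2, 5 };
	int n = sizeof(possible_cpus) / sizeof(possible_cpus[0]);
	void **by_count = calloc(n, sizeof(void *));	/* 3 slots: too small  */
	void **by_id = calloc(NR_CPUS, sizeof(void *));	/* NR_CPUS: always safe */
	int i;

	for (i = 0; i < n; i++) {
		int cpu = possible_cpus[i];

		/* indexing by cpu id overflows the count-sized array for cpu 5 */
		printf("cpu %d: count-sized array %s, NR_CPUS-sized array ok\n",
		       cpu, cpu < n ? "ok" : "would overflow");
		by_id[cpu] = &possible_cpus[i];	/* in bounds for any possible cpu */
	}
	free(by_count);
	free(by_id);
	return 0;
}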
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 6616900..03c13b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -136,6 +136,7 @@ extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; +struct task_grp; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); @@ -1834,6 +1835,17 @@ extern int sched_mc_power_savings, sched_smt_power_savings; extern void normalize_rt_tasks(void); +#ifdef CONFIG_FAIR_GROUP_SCHED + +extern struct task_grp init_task_grp; + +extern struct task_grp *sched_create_group(void); +extern void sched_destroy_group(struct task_grp *tg); +extern void sched_move_task(struct task_struct *tsk); +extern int sched_group_set_shares(struct task_grp *tg, unsigned long shares); + +#endif + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/init/Kconfig b/init/Kconfig index 11c6762..ef90a15 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -282,13 +282,12 @@ config CPUSETS Say N if unsure. config FAIR_GROUP_SCHED - bool "Fair group scheduler" - depends on EXPERIMENTAL && CONTAINERS + bool "Fair group cpu scheduler" + default n + depends on EXPERIMENTAL help - This option enables you to group tasks and control CPU resource - allocation to such groups. - - Say N if unsure. + This feature lets cpu scheduler recognize task groups and control cpu + bandwidth allocation to such task groups. config SYSFS_DEPRECATED bool "Create deprecated sysfs files" diff --git a/kernel/sched.c b/kernel/sched.c index ee7ac71..e10c403 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -173,13 +173,10 @@ struct rt_prio_array { #ifdef CONFIG_FAIR_GROUP_SCHED -#include - struct cfs_rq; /* task group related information */ struct task_grp { - struct container_subsys_state css; /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ @@ -192,22 +189,28 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS]; +static struct sched_entity *init_sched_entity_p[NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; /* Default task group. * Every task in system belong to this group at bootup. */ -static struct task_grp init_task_grp = { - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, - }; +struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, + }; + +#define INIT_TASK_GRP_LOAD NICE_0_LOAD +static int init_task_grp_load = INIT_TASK_GRP_LOAD; /* return group to which a task belongs */ static inline struct task_grp *task_grp(struct task_struct *p) { - return container_of(task_subsys_state(p, cpu_subsys_id), - struct task_grp, css); + struct task_grp *tg; + + tg = &init_task_grp; + + return tg; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -250,6 +253,7 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? 
*/ struct task_grp *tg; /* group that "owns" this runqueue */ + struct rcu_head rcu; #endif }; @@ -6513,11 +6517,12 @@ void __init sched_init(void) init_sched_entity_p[i] = se; se->cfs_rq = &rq->cfs; se->my_q = cfs_rq; - se->load.weight = NICE_0_LOAD; - se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->load.weight = init_task_grp_load; + se->load.inv_weight = + div64_64(1ULL<<32, init_task_grp_load); se->parent = NULL; } - init_task_grp.shares = NICE_0_LOAD; + init_task_grp.shares = init_task_grp_load; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6707,45 +6712,28 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED -/* return corresponding task_grp object of a container */ -static inline struct task_grp *container_tg(struct container *cont) -{ - return container_of(container_subsys_state(cont, cpu_subsys_id), - struct task_grp, css); -} - /* allocate runqueue etc for a new task group */ -static struct container_subsys_state * -sched_create_group(struct container_subsys *ss, struct container *cont) +struct task_grp *sched_create_group(void) { struct task_grp *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; + struct rq *rq; int i; - if (!cont->parent) { - /* This is early initialization for the top container */ - init_task_grp.css.container = cont; - return &init_task_grp.css; - } - - /* we support only 1-level deep hierarchical scheduler atm */ - if (cont->parent->parent) - return ERR_PTR(-EINVAL); - tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) return ERR_PTR(-ENOMEM); - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); + rq = cpu_rq(i); cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); @@ -6763,7 +6751,6 @@ sched_create_group(struct container_subsys *ss, struct container *cont) tg->cfs_rq[i] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[i] = se; se->cfs_rq = &rq->cfs; @@ -6773,12 +6760,15 @@ sched_create_group(struct container_subsys *ss, struct container *cont) se->parent = NULL; } - tg->shares = NICE_0_LOAD; + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } - /* Bind the container to task_grp object we just created */ - tg->css.container = cont; + tg->shares = NICE_0_LOAD; - return &tg->css; + return tg; err: for_each_possible_cpu(i) { @@ -6797,24 +6787,14 @@ err: return ERR_PTR(-ENOMEM); } - -/* destroy runqueue etc associated with a task group */ -static void sched_destroy_group(struct container_subsys *ss, - struct container *cont) +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group(struct rcu_head *rhp) { - struct task_grp *tg = container_tg(cont); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); + struct task_grp *tg = cfs_rq->tg; struct sched_entity *se; int i; - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } - - /* wait for possible concurrent references to cfs_rqs complete */ - synchronize_sched(); - /* now it should be safe to free those cfs_rqs */ for_each_possible_cpu(i) { cfs_rq = 
tg->cfs_rq[i]; @@ -6829,19 +6809,29 @@ static void sched_destroy_group(struct container_subsys *ss, kfree(tg); } -static int sched_can_attach(struct container_subsys *ss, - struct container *cont, struct task_struct *tsk) +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_grp *tg) { - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; + struct cfs_rq *cfs_rq; + int i; - return 0; + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + cfs_rq = tg->cfs_rq[0]; + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&cfs_rq->rcu, free_sched_group); } -/* change task's runqueue when it moves between groups */ -static void sched_move_task(struct container_subsys *ss, struct container *cont, - struct container *old_cont, struct task_struct *tsk) +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. + */ +void sched_move_task(struct task_struct *tsk) { int on_rq, running; unsigned long flags; @@ -6896,58 +6886,20 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) spin_unlock_irq(&rq->lock); } -static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +int sched_group_set_shares(struct task_grp *tg, unsigned long shares) { int i; - unsigned long shareval; - struct task_grp *tg = container_tg(cont); - char buffer[2*sizeof(unsigned long) + 1]; - - if (nbytes > 2*sizeof(unsigned long)) /* safety check */ - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (tg->shares == shares) + return 0; - buffer[nbytes] = 0; /* nul-terminate */ - shareval = simple_strtoul(buffer, NULL, 10); + /* return -EINVAL if the new value is not sane */ - tg->shares = shareval; + tg->shares = shares; for_each_possible_cpu(i) - set_se_shares(tg->se[i], shareval); - - return nbytes; -} - -static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft) -{ - struct task_grp *tg = container_tg(cont); - - return (u64) tg->shares; -} + set_se_shares(tg->se[i], shares); -struct cftype cpuctl_share = { - .name = "shares", - .read_uint = cpu_shares_read_uint, - .write = cpu_shares_write, -}; - -static int sched_populate(struct container_subsys *ss, struct container *cont) -{ - return container_add_file(cont, ss, &cpuctl_share); + return 0; } -struct container_subsys cpu_subsys = { - .name = "cpu", - .create = sched_create_group, - .destroy = sched_destroy_group, - .can_attach = sched_can_attach, - .attach = sched_move_task, - .populate = sched_populate, - .subsys_id = cpu_subsys_id, - .early_init = 1, -}; - -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 556942c..abd65ed 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -877,7 +877,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) if (!cfs_rq->nr_running) return MAX_PRIO; - curr = __pick_next_entity(cfs_rq); + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + p = task_of(curr); return p->prio; -- cgit v0.10.2 From 24e377a83220ef05c9b5bec7e01d65eed6609aa6 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 
17:00:09 +0200 Subject: sched: add fair-user scheduler Enable user-id based fair group scheduling. This is useful for anyone who wants to test the group scheduler w/o having to enable CONFIG_CGROUPS. A separate scheduling group (i.e struct task_grp) is automatically created for every new user added to the system. Upon uid change for a task, it is made to move to the corresponding scheduling group. A /proc tunable (/proc/root_user_share) is also provided to tune root user's quota of cpu bandwidth. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 03c13b6..d0cc583 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -597,6 +597,10 @@ struct user_struct { /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + +#ifdef CONFIG_FAIR_USER_SCHED + struct task_grp *tg; +#endif }; extern struct user_struct *find_user(uid_t); diff --git a/init/Kconfig b/init/Kconfig index ef90a15..37711fe 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -289,6 +289,19 @@ config FAIR_GROUP_SCHED This feature lets cpu scheduler recognize task groups and control cpu bandwidth allocation to such task groups. +choice + depends on FAIR_GROUP_SCHED + prompt "Basis for grouping tasks" + default FAIR_USER_SCHED + + config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping + tasks, thus providing equal cpu bandwidth to each user. + +endchoice + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y diff --git a/kernel/sched.c b/kernel/sched.c index e10c403..f33608e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -200,7 +200,12 @@ struct task_grp init_task_grp = { .cfs_rq = init_cfs_rq_p, }; +#ifdef CONFIG_FAIR_USER_SCHED +#define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +#else #define INIT_TASK_GRP_LOAD NICE_0_LOAD +#endif + static int init_task_grp_load = INIT_TASK_GRP_LOAD; /* return group to which a task belongs */ @@ -208,7 +213,11 @@ static inline struct task_grp *task_grp(struct task_struct *p) { struct task_grp *tg; +#ifdef CONFIG_FAIR_USER_SCHED + tg = p->user->tg; +#else tg = &init_task_grp; +#endif return tg; } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3e47e87..57ee9d5 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -212,6 +212,49 @@ static void sysrq_sched_debug_show(void) sched_debug_show(NULL, NULL); } +#ifdef CONFIG_FAIR_USER_SCHED + +static DEFINE_MUTEX(root_user_share_mutex); + +static int +root_user_share_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len; + + len = sprintf(page, "%d\n", init_task_grp_load); + + return len; +} + +static int +root_user_share_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned long shares; + char kbuf[sizeof(unsigned long)+1]; + int rc = 0; + + if (copy_from_user(kbuf, buffer, sizeof(kbuf))) + return -EFAULT; + + shares = simple_strtoul(kbuf, NULL, 0); + + if (!shares) + shares = NICE_0_LOAD; + + mutex_lock(&root_user_share_mutex); + + init_task_grp_load = shares; + rc = sched_group_set_shares(&init_task_grp, shares); + + mutex_unlock(&root_user_share_mutex); + + return (rc < 0 ? 
rc : count); +} + +#endif /* CONFIG_FAIR_USER_SCHED */ + static int sched_debug_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_debug_show, NULL); @@ -234,6 +277,15 @@ static int __init init_sched_debug_procfs(void) pe->proc_fops = &sched_debug_fops; +#ifdef CONFIG_FAIR_USER_SCHED + pe = create_proc_entry("root_user_share", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->read_proc = root_user_share_read_proc; + pe->write_proc = root_user_share_write_proc; +#endif + return 0; } diff --git a/kernel/user.c b/kernel/user.c index 9ca2848..c6387fa 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -50,8 +50,41 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif +#ifdef CONFIG_FAIR_USER_SCHED + .tg = &init_task_grp, +#endif }; +#ifdef CONFIG_FAIR_USER_SCHED +static void sched_destroy_user(struct user_struct *up) +{ + sched_destroy_group(up->tg); +} + +static int sched_create_user(struct user_struct *up) +{ + int rc = 0; + + up->tg = sched_create_group(); + if (IS_ERR(up->tg)) + rc = -ENOMEM; + + return rc; +} + +static void sched_switch_user(struct task_struct *p) +{ + sched_move_task(p); +} + +#else /* CONFIG_FAIR_USER_SCHED */ + +static void sched_destroy_user(struct user_struct *up) { } +static int sched_create_user(struct user_struct *up) { return 0; } +static void sched_switch_user(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_USER_SCHED */ + /* * These routines must be called with the uidhash spinlock held! */ @@ -109,6 +142,7 @@ void free_uid(struct user_struct *up) if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); @@ -150,6 +184,13 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) return NULL; } + if (sched_create_user(new) < 0) { + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + return NULL; + } + /* * Before adding this, check whether we raced * on adding the same user already.. @@ -157,6 +198,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + sched_destroy_user(new); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -184,6 +226,7 @@ void switch_uid(struct user_struct *new_user) atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + sched_switch_user(current); /* * We need to synchronize with __sigqueue_alloc() -- cgit v0.10.2 From 7ed2be459b61c66fcc4926ffb073a25fc077d51f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: fair-group sched, cleanups fair-group sched, cleanups. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/init/Kconfig b/init/Kconfig index 37711fe..b680733 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -294,10 +294,10 @@ choice prompt "Basis for grouping tasks" default FAIR_USER_SCHED - config FAIR_USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping + config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping tasks, thus providing equal cpu bandwidth to each user. 
endchoice -- cgit v0.10.2 From de8d585a12aef40676f12ddc63e97daaf7752ba1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: enable CONFIG_FAIR_GROUP_SCHED=y by default enable CONFIG_FAIR_GROUP_SCHED=y by default. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/init/Kconfig b/init/Kconfig index b680733..faed9a0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -283,7 +283,7 @@ config CPUSETS config FAIR_GROUP_SCHED bool "Fair group cpu scheduler" - default n + default y depends on EXPERIMENTAL help This feature lets cpu scheduler recognize task groups and control cpu -- cgit v0.10.2 From b8efb56172bc55082b8490778b07ef73eea0b551 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: BKL usage statistics add per task and per rq BKL usage statistics. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index d0cc583..920eb73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -621,6 +621,10 @@ struct sched_info { /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ +#ifdef CONFIG_SCHEDSTATS + /* BKL stats */ + unsigned long bkl_cnt; +#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ diff --git a/kernel/sched.c b/kernel/sched.c index f33608e..5004dff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -356,6 +356,9 @@ struct rq { /* try_to_wake_up() stats */ unsigned long ttwu_cnt; unsigned long ttwu_local; + + /* BKL stats */ + unsigned long bkl_cnt; #endif struct lock_class_key rq_lock_key; }; @@ -3414,6 +3417,12 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq(), sched_cnt); +#ifdef CONFIG_SCHEDSTATS + if (unlikely(prev->lock_depth >= 0)) { + schedstat_inc(this_rq(), bkl_cnt); + schedstat_inc(prev, sched_info.bkl_cnt); + } +#endif } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 57ee9d5..823b63a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -136,6 +136,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); + SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", + rq->bkl_cnt); } static void print_cpu(struct seq_file *m, int cpu) @@ -323,6 +325,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_max); PN(se.slice_max); PN(se.wait_max); + P(sched_info.bkl_cnt); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -350,6 +353,7 @@ void proc_sched_set_task(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; + p->sched_info.bkl_cnt = 0; #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; -- cgit v0.10.2 From fdd71d132badad542a9ab99ab4a9c3c08fa6412f Mon Sep 17 00:00:00 2001 From: "S.Caglar Onur" Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: BKL usage statistics, fix build fix for the SCHED_DEBUG && !SCHEDSTATS case. 
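Returning to the fair-user scheduler patch above: it exposes the root user's CPU share as a plain integer behind /proc/root_user_share (a later cleanup in this series renames the file to /proc/root_user_cpu_share). A minimal userspace sketch of reading and raising that share might look as follows; it is illustrative only, assumes a kernel built with CONFIG_FAIR_USER_SCHED, and the doubling factor is arbitrary:

#include <stdio.h>

int main(void)
{
        /* proc file created by the fair-user scheduler patch above;
         * later kernels in this series call it root_user_cpu_share. */
        const char *path = "/proc/root_user_share";
        unsigned long share;
        FILE *f;

        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%lu", &share) != 1) {
                fprintf(stderr, "could not parse %s\n", path);
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("root user share: %lu\n", share);

        /* Give the root user twice its current share (needs root). */
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "%lu\n", share * 2);
        fclose(f);
        return 0;
}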
Signed-off-by: S.Caglar Onur Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 823b63a..b6d0a94 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -136,8 +136,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_SCHEDSTATS SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", rq->bkl_cnt); +#endif } static void print_cpu(struct seq_file *m, int cpu) -- cgit v0.10.2 From c18b8a7cbcbac46497ee1ce656b0e68197c7581d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched: remove unneeded tunables remove unneeded tunables. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 920eb73..2c33227 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1403,8 +1403,6 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; -extern unsigned int sysctl_sched_stat_granularity; -extern unsigned int sysctl_sched_runtime_limit; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index abd65ed..5db7bd1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -76,8 +76,6 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; -unsigned int sysctl_sched_runtime_limit __read_mostly; - extern struct sched_class fair_sched_class; /************************************************************** -- cgit v0.10.2 From 1aa4731eff7dab7bd01747b46f654f449f1cfc2c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: print settings print the current value of all tunables in /proc/sched_debug output. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index b6d0a94..d79e1ec 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -203,6 +203,19 @@ static int sched_debug_show(struct seq_file *m, void *v) SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_min_granularity); + PN(sysctl_sched_wakeup_granularity); + PN(sysctl_sched_batch_wakeup_granularity); + PN(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P + for_each_online_cpu(cpu) print_cpu(m, cpu); -- cgit v0.10.2 From 67e9fb2a39a1d454218d50383094940982be138f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched: add vslice add vslice: the load-dependent "virtual slice" a task should run ideally, so that the observed latency stays within the sched_latency window.
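A rough worked example of that idea (the specific numbers and the nice-0 normalization are assumptions of this note, not part of the patch): with the default 20 msec sched_latency and four runnable nice-0 tasks, the period stays at 20 msec and each task's wall-clock slice is p * w/rw = 20/4 = 5 msec; since the weights are equal, the virtual slice is likewise p/nr_running = 5 msec. With mixed nice levels the wall-clock slices would differ, but every task's vruntime would still advance by about the same amount per period - that common amount is the "virtual slice" this patch starts computing, and bounding it is what keeps observed latency inside the sched_latency window.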
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c33227..d74830c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -908,6 +908,7 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; + u64 last_min_vruntime; #ifdef CONFIG_SCHEDSTATS u64 wait_start; diff --git a/kernel/sched.c b/kernel/sched.c index 5004dff..fe1165b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1615,6 +1615,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.last_min_vruntime = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -6495,6 +6496,7 @@ static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } void __init sched_init(void) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5db7bd1..87acc5c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -243,6 +243,15 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return period; } +static u64 __sched_vslice(unsigned long nr_running) +{ + u64 period = __sched_period(nr_running); + + do_div(period, nr_running); + + return period; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -441,32 +450,33 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - u64 min_runtime, latency; + u64 vruntime; - min_runtime = cfs_rq->min_vruntime; + vruntime = cfs_rq->min_vruntime; if (sched_feat(USE_TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); if (last) { - min_runtime = __pick_next_entity(cfs_rq)->vruntime; - min_runtime += last->vruntime; - min_runtime >>= 1; + vruntime += last->vruntime; + vruntime >>= 1; } - } else if (sched_feat(APPROX_AVG)) - min_runtime += sysctl_sched_latency/2; + } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) + vruntime += __sched_vslice(cfs_rq->nr_running)/2; if (initial && sched_feat(START_DEBIT)) - min_runtime += sched_slice(cfs_rq, se); + vruntime += __sched_vslice(cfs_rq->nr_running + 1); if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { - latency = sysctl_sched_latency; - if (min_runtime > latency) - min_runtime -= latency; + s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; + if (latency < 0 || !cfs_rq->nr_running) + latency = 0; else - min_runtime = 0; + latency = min_t(s64, latency, sysctl_sched_latency); + vruntime -= latency; } - se->vruntime = max(se->vruntime, min_runtime); + se->vruntime = vruntime; + } static void @@ -478,6 +488,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) update_curr(cfs_rq); if (wakeup) { + /* se->vruntime += cfs_rq->min_vruntime; */ place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -492,8 +503,8 @@ static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); -#ifdef CONFIG_SCHEDSTATS if (sleep) { +#ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -502,8 +513,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } - } #endif + /* se->vruntime = 
entity_key(cfs_rq, se); */ + se->last_min_vruntime = cfs_rq->min_vruntime; + } + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); -- cgit v0.10.2 From d822cecedad88b69a7d68aa8d49e1f238aa320c7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: more width for parameter printouts more width for parameter printouts in /proc/sched_debug. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index d79e1ec..b24f17d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -204,9 +204,9 @@ static int sched_debug_show(struct seq_file *m, void *v) SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); #define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(x)) + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); -- cgit v0.10.2 From ddc972975091ba5f839bf24d0f9ef54fe90ee741 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: check spread debug feature: check how well we schedule within a reasonable vruntime 'spread' range. (note that CPU overload can increase the spread, so this is not a hard condition, but normal loads should be within the spread.) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/kernel/sched.c b/kernel/sched.c index fe1165b..213294f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -250,6 +250,9 @@ struct cfs_rq { * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; + + unsigned long nr_spread_over; + #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index b24f17d..4659c90 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -140,6 +140,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", rq->bkl_cnt); #endif + SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + cfs_rq->nr_spread_over); } static void print_cpu(struct seq_file *m, int cpu) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 87acc5c..8ea4c9b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -447,6 +447,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) #endif } +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { @@ -494,6 +507,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); @@ -587,6 +601,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_curr_end(cfs_rq, prev); + check_spread(cfs_rq, prev); if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. 
*/ @@ -996,6 +1011,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) } update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + check_spread(cfs_rq, curr); __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); resched_task(rq->curr); -- cgit v0.10.2 From 8465e792e82c567b80358e38732164b770ed4b7f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: entity_key() fix entity_key() fix - we'd occasionally end up with a 0 vruntime in the !initial case. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8ea4c9b..926491f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -479,13 +479,16 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (initial && sched_feat(START_DEBIT)) vruntime += __sched_vslice(cfs_rq->nr_running + 1); - if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { - s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; - if (latency < 0 || !cfs_rq->nr_running) - latency = 0; - else - latency = min_t(s64, latency, sysctl_sched_latency); - vruntime -= latency; + if (!initial) { + if (sched_feat(NEW_FAIR_SLEEPERS)) { + s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; + if (latency < 0 || !cfs_rq->nr_running) + latency = 0; + else + latency = min_t(s64, latency, sysctl_sched_latency); + vruntime -= latency; + } + vruntime = max(vruntime, se->vruntime); } se->vruntime = vruntime; -- cgit v0.10.2 From 785c29ef9573d98b31493c9a68c3589449082108 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: remove condition from set_task_cpu() remove condition from set_task_cpu(). Now that ->vruntime is not global anymore, it should (and does) work fine without it too. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/kernel/sched.c b/kernel/sched.c index 213294f..c779bf9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1052,9 +1052,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.block_start) p->se.block_start -= clock_offset; #endif - if (likely(new_rq->cfs.min_vruntime)) - p->se.vruntime -= old_rq->cfs.min_vruntime - - new_rq->cfs.min_vruntime; + p->se.vruntime -= old_rq->cfs.min_vruntime - new_rq->cfs.min_vruntime; __set_task_cpu(p, new_cpu); } -- cgit v0.10.2 From dc1f31c90cfa067af6f7000db7a5383c7667ccba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: remove last_min_vruntime effect remove last_min_vruntime use - prepare to remove it. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 926491f..0228de1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -481,7 +481,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { if (sched_feat(NEW_FAIR_SLEEPERS)) { - s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; + s64 latency = cfs_rq->min_vruntime - se->vruntime; if (latency < 0 || !cfs_rq->nr_running) latency = 0; else -- cgit v0.10.2 From 94359f05cb7e1fed0deccc83ebc30a1175a9ae16 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: undo some of the recent changes undo some of the recent changes that are not needed after all, such as last_min_vruntime. 
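One way to picture the set_task_cpu() change above (illustrative numbers, arbitrary units): vruntime is now only meaningful relative to the local cfs_rq's min_vruntime, so a migrating task is simply rebased. If the old runqueue's min_vruntime is 1000, the new runqueue's is 400 and the task's vruntime is 1003, then after p->se.vruntime -= 1000 - 400 the task carries 403, keeping the same +3 offset against the local minimum rather than an absolute value that means nothing on the new CPU.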
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/include/linux/sched.h b/include/linux/sched.h index d74830c..2c33227 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -908,7 +908,6 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; - u64 last_min_vruntime; #ifdef CONFIG_SCHEDSTATS u64 wait_start; diff --git a/kernel/sched.c b/kernel/sched.c index c779bf9..744bd50 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1616,7 +1616,6 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.last_min_vruntime = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0228de1..62a9ee8d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -480,14 +480,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += __sched_vslice(cfs_rq->nr_running + 1); if (!initial) { - if (sched_feat(NEW_FAIR_SLEEPERS)) { - s64 latency = cfs_rq->min_vruntime - se->vruntime; - if (latency < 0 || !cfs_rq->nr_running) - latency = 0; - else - latency = min_t(s64, latency, sysctl_sched_latency); - vruntime -= latency; - } + if (sched_feat(NEW_FAIR_SLEEPERS)) + vruntime -= sysctl_sched_latency; + vruntime = max(vruntime, se->vruntime); } @@ -531,8 +526,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) se->block_start = rq_of(cfs_rq)->clock; } #endif - /* se->vruntime = entity_key(cfs_rq, se); */ - se->last_min_vruntime = cfs_rq->min_vruntime; } if (se != cfs_rq->curr) -- cgit v0.10.2 From b8487b924177385e3932f846f430b73ce8e69bba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: fix sign check error in place_entity() fix sign check error in place_entity() - we'd get excessive latencies due to negatives being converted to large u64's. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 62a9ee8d..2bd9625 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -483,7 +483,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (sched_feat(NEW_FAIR_SLEEPERS)) vruntime -= sysctl_sched_latency; - vruntime = max(vruntime, se->vruntime); + vruntime = max_t(s64, vruntime, se->vruntime); } se->vruntime = vruntime; -- cgit v0.10.2 From 02e4bac2a5b097e23d757bf2953740b3d51b7976 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: fix sched_fork() fix sched_fork(): large latencies at new task creation time because the ->vruntime was not fixed up cross-CPU, if the parent got migrated after the child's CPU got set up. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 744bd50..36484da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1657,7 +1657,7 @@ void sched_fork(struct task_struct *p, int clone_flags) #ifdef CONFIG_SMP cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif - __set_task_cpu(p, cpu); + set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: -- cgit v0.10.2 From 368059a977871def0f88a92eefb6ecc1f7b6132f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: max_vruntime() simplification max_vruntime() simplification. 
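The simplification below replaces the magic-range check with a signed-delta compare. A small userspace sketch of why that is wraparound-safe (illustrative only, not part of the patch; it assumes the two values are never more than 2^63 apart, and reuses the near-wrap initial min_vruntime that init_cfs_rq() sets up a few patches earlier):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Same shape as the kernel's max_vruntime() after this patch. */
static uint64_t max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
{
        int64_t delta = (int64_t)(vruntime - min_vruntime);

        if (delta > 0)
                min_vruntime = vruntime;

        return min_vruntime;
}

int main(void)
{
        /* min_vruntime just below the u64 wrap point, like the
         * (u64)(-(1LL << 20)) initial value used in init_cfs_rq(). */
        uint64_t min = (uint64_t)(-(1LL << 20));
        uint64_t v = 1000;      /* a vruntime that has wrapped past zero */

        /* A naive "v > min" compare refuses to advance across the wrap;
         * the signed delta sees v as 1048576 + 1000 units ahead of min. */
        printf("naive compare keeps:  %" PRIu64 "\n", v > min ? v : min);
        printf("signed delta returns: %" PRIu64 "\n", max_vruntime(min, v));
        return 0;
}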
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2bd9625..91664d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -117,8 +117,8 @@ static inline struct task_struct *task_of(struct sched_entity *se) static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) { - if ((vruntime > min_vruntime) || - (min_vruntime > (1ULL << 61) && vruntime < (1ULL << 50))) + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) min_vruntime = vruntime; return min_vruntime; -- cgit v0.10.2 From 2ddbf952508fb9911036c484a87f6351106b917c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: clean up sched_fork() The adjusting sched_class is a missing part of the already existing "do not leak PI boosting priority to the child" at the sched_fork(). This patch moves the adjusting sched_class from wake_up_new_task() to sched_fork(). this also shrinks the code a bit: text data bss dec hex filename 40111 4018 292 44421 ad85 sched.o.before 40102 4018 292 44412 ad7c sched.o.after Signed-off-by: Hiroshi Shimamoto Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 36484da..cd2b494 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1663,6 +1663,8 @@ void sched_fork(struct task_struct *p, int clone_flags) * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -1698,11 +1700,6 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->prio = effective_prio(p); - if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - if (task_cpu(p) != this_cpu || !p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); -- cgit v0.10.2 From 57cb499df26d80ec11cd49e56d20835334ac4ab9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: remove set_leftmost() Lee Schermerhorn noticed that set_leftmost() contains dead code, remove this. 
Reported-by: Lee Schermerhorn Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 91664d6..48c6921 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -124,16 +124,6 @@ max_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline void -set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) -{ - struct sched_entity *se; - - cfs_rq->rb_leftmost = leftmost; - if (leftmost) - se = rb_entry(leftmost, struct sched_entity, run_node); -} - static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -175,7 +165,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * used): */ if (leftmost) - set_leftmost(cfs_rq, &se->run_node); + cfs_rq->rb_leftmost = &se->run_node; rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -185,7 +175,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) - set_leftmost(cfs_rq, rb_next(&se->run_node)); + cfs_rq->rb_leftmost = rb_next(&se->run_node); rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -- cgit v0.10.2 From 8651a86c342ab79a956afec0c5971acaad38d3a1 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler wakeup latency fix group scheduler wakeup latency fix: when checking for preemption we must check cross-group too, not just intra-group. Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 48c6921..5384a97 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -796,7 +796,8 @@ static void yield_task_fair(struct rq *rq) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct cfs_rq *cfs_rq = task_cfs_rq(curr), *pcfs_rq; + struct sched_entity *se = &curr->se, *pse = &p->se; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -804,11 +805,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } - if (is_same_group(curr, p)) { - s64 delta = curr->se.vruntime - p->se.vruntime; - if (delta > (s64)sysctl_sched_wakeup_granularity) - resched_task(curr); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + pcfs_rq = cfs_rq_of(pse); + + if (cfs_rq == pcfs_rq) { + s64 delta = se->vruntime - pse->vruntime; + + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); + break; + } +#ifdef CONFIG_FAIR_GROUP_SCHED + pse = pse->parent; +#endif } } -- cgit v0.10.2 From 2b1e315dd2822c99793485f9e53a73459fb399c1 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: yield fix fix yield bugs due to the current-not-in-rbtree changes: the task is not in the rbtree so rbtree-removal is a no-no. [ From: Srivatsa Vaddagiri : build fix. 
] also, nice code size reduction: kernel/sched.o: text data bss dec hex filename 38323 3506 24 41853 a37d sched.o.before 38236 3506 24 41766 a326 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Dmitry Adamushko Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5384a97..fcd6900 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -739,9 +739,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) static void yield_task_fair(struct rq *rq) { struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct sched_entity *rightmost, *se = &rq->curr->se; - struct rb_node *parent; /* * Are we the only task in the tree? @@ -755,39 +753,26 @@ static void yield_task_fair(struct rq *rq) * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, se, 0); - enqueue_entity(cfs_rq, se, 0); + update_curr(cfs_rq); return; } /* * Find the rightmost entry in the rbtree: */ - do { - parent = *link; - link = &parent->rb_right; - } while (*link); - - rightmost = rb_entry(parent, struct sched_entity, run_node); + rightmost = __pick_last_entity(cfs_rq); /* * Already in the rightmost position? */ - if (unlikely(rightmost == se)) + if (unlikely(rightmost->vruntime < se->vruntime)) return; /* * Minimally necessary key value to be last in the tree: + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. */ se->vruntime = rightmost->vruntime + 1; - - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); - /* - * Relink the task to the rightmost position: - */ - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } /* -- cgit v0.10.2 From 2d72376b3af1e7d4d4515ebfd0f4383f2e92c343 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: clean up schedstats, cnt -> count rename all 'cnt' fields and variables to the less yucky 'count' name. yuckage noticed by Andrew Morton. 
no change in code, other than the /proc/sched_debug bkl_count string got a bit larger: text data bss dec hex filename 38236 3506 24 41766 a326 sched.o.before 38240 3506 24 41770 a32a sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/fs/proc/base.c b/fs/proc/base.c index 19489b0..e5d0953 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) return sprintf(buffer, "%llu %llu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, - task->sched_info.pcnt); + task->sched_info.pcount); } #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c33227..d5daca4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -614,7 +614,7 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long pcnt; /* # of times run on this cpu */ + unsigned long pcount; /* # of times run on this cpu */ unsigned long long cpu_time, /* time spent on the cpu */ run_delay; /* time spent waiting on a runqueue */ @@ -623,7 +623,7 @@ struct sched_info { last_queued; /* when we were last queued to run */ #ifdef CONFIG_SCHEDSTATS /* BKL stats */ - unsigned long bkl_cnt; + unsigned long bkl_count; #endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -759,7 +759,7 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; + unsigned long lb_count[CPU_MAX_IDLE_TYPES]; unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; @@ -769,17 +769,17 @@ struct sched_domain { unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ - unsigned long alb_cnt; + unsigned long alb_count; unsigned long alb_failed; unsigned long alb_pushed; /* SD_BALANCE_EXEC stats */ - unsigned long sbe_cnt; + unsigned long sbe_count; unsigned long sbe_balanced; unsigned long sbe_pushed; /* SD_BALANCE_FORK stats */ - unsigned long sbf_cnt; + unsigned long sbf_count; unsigned long sbf_balanced; unsigned long sbf_pushed; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 81e6978..09e9574 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) * No locking available for sched_info (and too expensive to add one) * Mitigate by taking snapshot of values */ - t1 = tsk->sched_info.pcnt; + t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; t3 = tsk->sched_info.cpu_time; diff --git a/kernel/sched.c b/kernel/sched.c index cd2b494..ba9fa6c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -349,19 +349,19 @@ struct rq { unsigned long yld_exp_empty; unsigned long yld_act_empty; unsigned long yld_both_empty; - unsigned long yld_cnt; + unsigned long yld_count; /* schedule() stats */ unsigned long sched_switch; - unsigned long sched_cnt; + unsigned long sched_count; unsigned long sched_goidle; /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; + unsigned long ttwu_count; unsigned long ttwu_local; /* BKL stats */ - unsigned long bkl_cnt; + unsigned long bkl_count; #endif struct lock_class_key rq_lock_key; }; @@ -1481,7 +1481,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) new_cpu = cpu; - schedstat_inc(rq, ttwu_cnt); + schedstat_inc(rq, ttwu_count); if (cpu == this_cpu) { schedstat_inc(rq, 
ttwu_local); goto out_set_cpu; @@ -2637,7 +2637,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[idle]); + schedstat_inc(sd, lb_count[idle]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, @@ -2790,7 +2790,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, &cpus, NULL); @@ -2924,7 +2924,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) } if (likely(sd)) { - schedstat_inc(sd, alb_cnt); + schedstat_inc(sd, alb_count); if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE)) @@ -3414,11 +3414,11 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_cnt); + schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_cnt); - schedstat_inc(prev, sched_info.bkl_cnt); + schedstat_inc(this_rq(), bkl_count); + schedstat_inc(prev, sched_info.bkl_count); } #endif } @@ -4558,7 +4558,7 @@ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_cnt); + schedstat_inc(rq, yld_count); current->sched_class->yield_task(rq); /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4659c90..be79cd6 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -137,8 +137,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", - rq->bkl_cnt); + SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", + rq->bkl_count); #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); @@ -342,7 +342,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_max); PN(se.slice_max); PN(se.wait_max); - P(sched_info.bkl_cnt); + P(sched_info.bkl_count); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -370,7 +370,7 @@ void proc_sched_set_task(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; - p->sched_info.bkl_cnt = 0; + p->sched_info.bkl_count = 0; #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 1d9ec98..1c08484 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v) struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_SMP struct sched_domain *sd; - int dcnt = 0; + int dcount = 0; #endif /* runqueue-specific stats */ seq_printf(seq, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + 
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v) char mask_str[NR_CPUS]; cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); + seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " "%lu", - sd->lb_cnt[itype], + sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], sd->lb_imbalance[itype], @@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v) } seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } @@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta) { if (rq) { rq->rq_sched_info.run_delay += delta; - rq->rq_sched_info.pcnt++; + rq->rq_sched_info.pcount++; } } @@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t) sched_info_dequeued(t); t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; - t->sched_info.pcnt++; + t->sched_info.pcount++; rq_sched_info_arrive(task_rq(t), delta); } -- cgit v0.10.2 From 2830cf8c90f37526d401f1999250312df970bfa3 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler SMP migration fix group scheduler SMP migration fix: use task_cfs_rq(p) to get to the relevant fair-scheduling runqueue of a task, rq->cfs is not the right one. Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index ba9fa6c..e1657e0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1040,6 +1040,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct cfs_rq *old_cfsrq = task_cfs_rq(p), + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; @@ -1052,7 +1054,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.block_start) p->se.block_start -= clock_offset; #endif - p->se.vruntime -= old_rq->cfs.min_vruntime - new_rq->cfs.min_vruntime; + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } -- cgit v0.10.2 From b0ffd246ea947a037746e725bd461bb7e809a4b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: clean up min_vruntime use clean up min_vruntime use. 
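The min_vruntime() helper introduced below mirrors max_vruntime(), and together they make the queue-wide min_vruntime move only forward: __update_curr() takes the smaller of curr's vruntime and the leftmost waiter's, then folds it in through max_vruntime(). As an illustrative example (arbitrary units): with cfs_rq->min_vruntime at 400, curr at 410 and the leftmost waiter at 405, the value advances to 405; if both were still behind 400, it would stay where it is.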
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fcd6900..ec0569e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -124,6 +124,16 @@ max_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } +static inline u64 +min_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -251,7 +261,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; - u64 next_vruntime, min_vruntime; + u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -269,19 +279,13 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, * value tracking the leftmost vruntime in the tree. */ if (first_fair(cfs_rq)) { - next_vruntime = __pick_next_entity(cfs_rq)->vruntime; - - /* min_vruntime() := !max_vruntime() */ - min_vruntime = max_vruntime(curr->vruntime, next_vruntime); - if (min_vruntime == next_vruntime) - min_vruntime = curr->vruntime; - else - min_vruntime = next_vruntime; + vruntime = min_vruntime(curr->vruntime, + __pick_next_entity(cfs_rq)->vruntime); } else - min_vruntime = curr->vruntime; + vruntime = curr->vruntime; cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, min_vruntime); + max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) -- cgit v0.10.2 From 5f6d858ecca78f71755859a346d845e302973cd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: speed up and simplify vslice calculations speed up and simplify vslice calculations. 
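The rework below drops sysctl_sched_min_granularity in favour of a direct task count, sysctl_sched_nr_latency (default 20). A standalone sketch of the resulting period arithmetic (illustrative only; equal nice-0 weights are assumed so the weight factors cancel and a task's slice is simply period/nr_running):

#include <stdio.h>

#define SCHED_LATENCY_NS        20000000ULL    /* default sysctl_sched_latency */
#define SCHED_NR_LATENCY        20             /* default sysctl_sched_nr_latency */

/* Mirrors __sched_period(): stretch the period once there are more
 * runnable tasks than fit into a single latency window. */
static unsigned long long sched_period(unsigned long nr_running)
{
        unsigned long long period = SCHED_LATENCY_NS;

        if (nr_running > SCHED_NR_LATENCY) {
                period *= nr_running;
                period /= SCHED_NR_LATENCY;
        }
        return period;
}

int main(void)
{
        unsigned long nr;

        for (nr = 1; nr <= 64; nr *= 2)
                printf("%2lu tasks: period %llu ns, slice %llu ns\n",
                       nr, sched_period(nr), sched_period(nr) / nr);
        return 0;
}

Up to 20 runnable tasks the period stays at the 20 msec latency target and the per-task slice shrinks; past that point the period stretches so that the slice never drops below latency/nr_latency = 1 msec.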
[ From: Mike Galbraith : build fix ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index d5daca4..97f736b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,7 +1400,7 @@ extern void sched_idle_next(void); #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_nr_latency; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index be79cd6..995bbd3 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -210,7 +210,7 @@ static int sched_debug_show(struct seq_file *m, void *v) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); + PN(sysctl_sched_nr_latency); PN(sysctl_sched_wakeup_granularity); PN(sysctl_sched_batch_wakeup_granularity); PN(sysctl_sched_child_runs_first); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec0569e..ae2d4b0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -46,7 +46,7 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +const_debug unsigned int sysctl_sched_nr_latency = 20; /* * sys_sched_yield() compat mode @@ -222,8 +222,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; - unsigned long nr_latency = - sysctl_sched_latency / sysctl_sched_min_granularity; + unsigned long nr_latency = sysctl_sched_nr_latency; if (unlikely(nr_running > nr_latency)) { period *= nr_running; @@ -245,11 +244,15 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) static u64 __sched_vslice(unsigned long nr_running) { - u64 period = __sched_period(nr_running); + unsigned long period = sysctl_sched_latency; + unsigned long nr_latency = sysctl_sched_nr_latency; - do_div(period, nr_running); + if (unlikely(nr_running > nr_latency)) + nr_running = nr_latency; - return period; + period /= nr_running; + + return (u64)period; } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97b15c2..230ca4e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -222,14 +222,11 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, + .procname = "sched_nr_latency", + .data = &sysctl_sched_nr_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, + .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, -- cgit v0.10.2 From b39c5dd7f938775fd0a1df5b4b1c26f854d15231 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: cleanup, remove stale comment cleanup, remove stale comment. 
Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ae2d4b0..c44a295 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -496,7 +496,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) update_curr(cfs_rq); if (wakeup) { - /* se->vruntime += cfs_rq->min_vruntime; */ place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } -- cgit v0.10.2 From fb615581c78efee25e4d04f1145e8fa8ec705dc3 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler, fix coding style issues Fix coding style issues reported by Randy Dunlap and others Signed-off-by: Dhaval Giani Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/init/Kconfig b/init/Kconfig index faed9a0..54f31a1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -282,11 +282,11 @@ config CPUSETS Say N if unsure. config FAIR_GROUP_SCHED - bool "Fair group cpu scheduler" + bool "Fair group CPU scheduler" default y depends on EXPERIMENTAL help - This feature lets cpu scheduler recognize task groups and control cpu + This feature lets CPU scheduler recognize task groups and control CPU bandwidth allocation to such task groups. choice @@ -294,11 +294,11 @@ choice prompt "Basis for grouping tasks" default FAIR_USER_SCHED - config FAIR_USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping - tasks, thus providing equal cpu bandwidth to each user. +config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping + tasks, thus providing equal CPU bandwidth to each user. endchoice diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 995bbd3..48748d0 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -239,11 +239,7 @@ static int root_user_share_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len; - - len = sprintf(page, "%d\n", init_task_grp_load); - - return len; + return sprintf(page, "%d\n", init_task_grp_load); } static int @@ -297,7 +293,7 @@ static int __init init_sched_debug_procfs(void) pe->proc_fops = &sched_debug_fops; #ifdef CONFIG_FAIR_USER_SCHED - pe = create_proc_entry("root_user_share", 0644, NULL); + pe = create_proc_entry("root_user_cpu_share", 0644, NULL); if (!pe) return -ENOMEM; -- cgit v0.10.2 From fad095a7b963d9e914e0cdb73e27355c47709441 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler, fix bloat Recent fix to check_preempt_wakeup() to check for preemption at higher levels caused a size bloat for !CONFIG_FAIR_GROUP_SCHED. Fix the problem. 42277 10598 320 53195 cfcb kernel/sched.o-before_this_patch 42216 10598 320 53134 cf8e kernel/sched.o-after_this_patch Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c44a295..57e7f36 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -652,15 +652,21 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) -/* Do the two (enqueued) tasks belong to the same group ? */ -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +/* Do the two (enqueued) entities belong to the same group ? 
*/ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { - if (curr->se.cfs_rq == p->se.cfs_rq) + if (se->cfs_rq == pse->cfs_rq) return 1; return 0; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -693,11 +699,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { return 1; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -787,8 +799,9 @@ static void yield_task_fair(struct rq *rq) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr), *pcfs_rq; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; + s64 delta; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -797,21 +810,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - pcfs_rq = cfs_rq_of(pse); + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } - if (cfs_rq == pcfs_rq) { - s64 delta = se->vruntime - pse->vruntime; + delta = se->vruntime - pse->vruntime; - if (delta > (s64)sysctl_sched_wakeup_granularity) - resched_task(curr); - break; - } -#ifdef CONFIG_FAIR_GROUP_SCHED - pse = pse->parent; -#endif - } + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v0.10.2 From b9fa3df33f9166daf81bfa8253d339f5a7726122 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler, fix latency There is a possibility that because of task of a group moving from one cpu to another, it may gain more cpu time that desired. See http://marc.info/?l=linux-kernel&m=119073197730334 for details. This is an attempt to fix that problem. Basically it simulates dequeue of higher level entities as if they are going to sleep. Similarly it simulate wakeup of higher level entities as if they are waking up from sleep. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 57e7f36..de13a6f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -727,6 +727,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); + wakeup = 1; } } @@ -746,6 +747,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; + sleep = 1; } } -- cgit v0.10.2 From 5522d5d5f70005faeffff3ffc0cfa8eec0155de4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: mark scheduling classes as const mark scheduling classes as const. 
This speeds up the code a bit and shrinks it: text data bss dec hex filename 40027 4018 292 44337 ad31 sched.o.before 40190 3842 292 44324 ad24 sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 97f736b..47e3717 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -863,7 +863,7 @@ struct rq; struct sched_domain; struct sched_class { - struct sched_class *next; + const struct sched_class *next; void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); @@ -949,7 +949,7 @@ struct task_struct { int prio, static_prio, normal_prio; struct list_head run_list; - struct sched_class *sched_class; + const struct sched_class *sched_class; struct sched_entity se; #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/kernel/sched.c b/kernel/sched.c index e1657e0..f582e2c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -847,9 +847,9 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, int *this_best_prio, struct rq_iterator *iterator); #include "sched_stats.h" -#include "sched_rt.c" -#include "sched_fair.c" #include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif @@ -2251,7 +2251,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { - struct sched_class *class = sched_class_highest; + const struct sched_class *class = sched_class_highest; unsigned long total_load_moved = 0; int this_best_prio = this_rq->curr->prio; @@ -2276,7 +2276,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { - struct sched_class *class; + const struct sched_class *class; int this_best_prio = MAX_PRIO; for (class = sched_class_highest; class; class = class->next) @@ -3432,7 +3432,7 @@ static inline void schedule_debug(struct task_struct *prev) static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev) { - struct sched_class *class; + const struct sched_class *class; struct task_struct *p; /* @@ -6504,13 +6504,6 @@ void __init sched_init(void) int highest_cpu = 0; int i, j; - /* - * Link up the scheduling class hierarchy: - */ - rt_sched_class.next = &fair_sched_class; - fair_sched_class.next = &idle_sched_class; - idle_sched_class.next = NULL; - for_each_possible_cpu(i) { struct rt_prio_array *array; struct rq *rq; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index de13a6f..32fd976 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -76,8 +76,6 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; -extern struct sched_class fair_sched_class; - /************************************************************** * CFS operations on generic schedulable entities: */ @@ -1031,7 +1029,8 @@ static void set_curr_task_fair(struct rq *rq) /* * All the scheduling class methods: */ -struct sched_class fair_sched_class __read_mostly = { +static const struct sched_class fair_sched_class = { + .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched_idletask.c
b/kernel/sched_idletask.c index 5ebf829..6e2ead4 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -57,7 +57,8 @@ static void set_curr_task_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -static struct sched_class idle_sched_class __read_mostly = { +const struct sched_class idle_sched_class = { + /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e1d5f1c..dbe4d8c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -225,7 +225,8 @@ static void set_curr_task_rt(struct rq *rq) p->se.exec_start = rq->clock; } -static struct sched_class rt_sched_class __read_mostly = { +const struct sched_class rt_sched_class = { + .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, -- cgit v0.10.2 From 3a2520157234d58abce89526756a32c272824f3f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: whitespace cleanups more whitespace cleanups. No code changed: text data bss dec hex filename 26553 2790 288 29631 73bf sched.o.before 26553 2790 288 29631 73bf sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index f582e2c..e717047 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -193,17 +193,17 @@ static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; /* Default task group. - * Every task in system belong to this group at bootup. + * Every task in system belong to this group at bootup. */ -struct task_grp init_task_grp = { - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, - }; +struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, +}; #ifdef CONFIG_FAIR_USER_SCHED -#define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD #else -#define INIT_TASK_GRP_LOAD NICE_0_LOAD +# define INIT_TASK_GRP_LOAD NICE_0_LOAD #endif static int init_task_grp_load = INIT_TASK_GRP_LOAD; @@ -6516,25 +6516,25 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - { - struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); - struct sched_entity *se = - &per_cpu(init_sched_entity, i); - - init_cfs_rq_p[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_grp; - list_add(&cfs_rq->leaf_cfs_rq_list, + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = &init_task_grp; + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - init_sched_entity_p[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = init_task_grp_load; + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = init_task_grp_load; se->load.inv_weight = div64_64(1ULL<<32, init_task_grp_load); - se->parent = NULL; - } + se->parent = NULL; + } init_task_grp.shares = init_task_grp_load; #endif @@ -6840,9 +6840,9 @@ void sched_destroy_group(struct task_grp *tg) } /* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. 
+ * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. */ void sched_move_task(struct task_struct *tsk) { @@ -6915,4 +6915,4 @@ int sched_group_set_shares(struct task_grp *tg, unsigned long shares) return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v0.10.2 From 647e7cac2d215fb8890f79252d7eaee3d6743d66 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: vslice fixups for non-0 nice levels Make vslice accurate wrt nice levels, and add some comments while we're at it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 32fd976..1f14b56 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -217,6 +217,15 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ + +/* + * The idea is to set a period in which each task runs once. + * + * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. + * + * p = (nr <= nl) ? l : l*nr/nl + */ static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; @@ -230,27 +239,45 @@ static u64 __sched_period(unsigned long nr_running) return period; } +/* + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. + * + * s = p*w/rw + */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 period = __sched_period(cfs_rq->nr_running); + u64 slice = __sched_period(cfs_rq->nr_running); - period *= se->load.weight; - do_div(period, cfs_rq->load.weight); + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); - return period; + return slice; } -static u64 __sched_vslice(unsigned long nr_running) +/* + * We calculate the vruntime slice. + * + * vs = s/w = p/rw + */ +static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) { - unsigned long period = sysctl_sched_latency; - unsigned long nr_latency = sysctl_sched_nr_latency; + u64 vslice = __sched_period(nr_running); - if (unlikely(nr_running > nr_latency)) - nr_running = nr_latency; + do_div(vslice, rq_weight); - period /= nr_running; + return vslice; +} - return (u64)period; +static u64 sched_vslice(struct cfs_rq *cfs_rq) +{ + return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); +} + +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return __sched_vslice(cfs_rq->load.weight + se->load.weight, + cfs_rq->nr_running + 1); } /* @@ -469,10 +496,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime >>= 1; } } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += __sched_vslice(cfs_rq->nr_running)/2; + vruntime += sched_vslice(cfs_rq)/2; if (initial && sched_feat(START_DEBIT)) - vruntime += __sched_vslice(cfs_rq->nr_running + 1); + vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { if (sched_feat(NEW_FAIR_SLEEPERS)) -- cgit v0.10.2 From 08ec3df5109e0555da5b9deb4382fd29733c852c Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: fix __pick_next_entity() The thing is that __pick_next_entity() must never be called when first_fair(cfs_rq) == NULL. 
It would not be a problem if 'run_node' were the very first field of 'struct sched_entity' (it is in fact the second). The 'nr_running != 0' check is _not_ enough, because 'current' is not within the tree. Generic paths are ok (e.g. schedule(), since put_prev_task() has been called beforehand)... I'm more worried about e.g. migration_call() -> CPU_DEAD_FROZEN -> migrate_dead_tasks(): if 'current' == rq->idle there is no problem, but if it is one of the SCHED_NORMAL tasks (or, imagine, some other use-case in the future -- i.e. the outer world should not depend on internal details of the sched_fair class), we have a problem. The fix adds +16 bytes to the ".text". Another variant would be to make 'run_node' the first data member of 'struct sched_entity', but an additional check (se != NULL) would still be needed in pick_next_entity(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1f14b56..fa78686 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -600,9 +600,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = NULL; - set_next_entity(cfs_rq, se); + if (first_fair(cfs_rq)) { + se = __pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + } return se; } -- cgit v0.10.2 From 1e819950660e6a811b549422ffb652273257e45e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: optimize schedule() a bit on SMP optimize schedule() a bit on SMP, by moving the rq-clock update outside the rq lock. code size is the same: text data bss dec hex filename 25725 2666 96 28487 6f47 sched.o.before 25725 2666 96 28487 6f47 sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index e717047..4f13d37 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3481,9 +3481,13 @@ need_resched_nonpreemptible: schedule_debug(prev); - spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); + /* + * Do the rq-clock update outside the rq lock: + */ + local_irq_disable(); __update_rq_clock(rq); + spin_lock(&rq->lock); + clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && -- cgit v0.10.2 From 155bb293ae8387526e6e07d42b1691104e55d9a2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: tweak wakeup granularity tweak wakeup granularity. Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fa78686..0856701 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -58,23 +58,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 25 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; +const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity.
- * (default: 1 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; +const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; /************************************************************** * CFS operations on generic schedulable entities: -- cgit v0.10.2 From a9957449b08ab561a33e1e038df06843b8d8dd9f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: uninline scheduler * save ~300 bytes * activate_idle_task() was moved to avoid a warning bloat-o-meter output: add/remove: 6/0 grow/shrink: 0/16 up/down: 438/-733 (-295) <=== function old new delta __enqueue_entity - 165 +165 finish_task_switch - 110 +110 update_curr_rt - 79 +79 __load_balance_iterator - 32 +32 __task_rq_unlock - 28 +28 find_process_by_pid - 24 +24 do_sched_setscheduler 133 123 -10 sys_sched_rr_get_interval 176 165 -11 sys_sched_getparam 156 145 -11 normalize_rt_tasks 482 470 -12 sched_getaffinity 112 99 -13 sys_sched_getscheduler 86 72 -14 sched_setaffinity 226 212 -14 sched_setscheduler 666 642 -24 load_balance_start_fair 33 9 -24 load_balance_next_fair 33 9 -24 dequeue_task_rt 133 67 -66 put_prev_task_rt 97 28 -69 schedule_tail 133 50 -83 schedule 682 594 -88 enqueue_entity 499 366 -133 task_new_fair 317 180 -137 Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 4f13d37..ce9bb7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -608,7 +608,7 @@ repeat_lock_task: return rq; } -static inline void __task_rq_unlock(struct rq *rq) +static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { spin_unlock(&rq->lock); @@ -623,7 +623,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) /* * this_rq_lock - lock this runqueue and disable interrupts. */ -static inline struct rq *this_rq_lock(void) +static struct rq *this_rq_lock(void) __acquires(rq->lock) { struct rq *rq; @@ -986,20 +986,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) } /* - * activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void activate_idle_task(struct task_struct *p, struct rq *rq) -{ - update_rq_clock(rq); - - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, 0); - inc_nr_running(p, rq); -} - -/* * deactivate_task - remove a task from the runqueue. */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) @@ -1206,7 +1192,7 @@ void kick_process(struct task_struct *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1221,7 +1207,7 @@ static inline unsigned long source_load(int cpu, int type) * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. 
*/ -static inline unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1813,7 +1799,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; @@ -3020,7 +3006,7 @@ static DEFINE_SPINLOCK(balancing); * * Balancing parameters are set up in arch_init_sched_domains. */ -static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(int cpu, enum cpu_idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); @@ -4140,7 +4126,7 @@ struct task_struct *idle_task(int cpu) * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ -static inline struct task_struct *find_process_by_pid(pid_t pid) +static struct task_struct *find_process_by_pid(pid_t pid) { return pid ? find_task_by_pid(pid) : current; } @@ -5157,6 +5143,20 @@ static void migrate_live_tasks(int src_cpu) } /* + * activate_idle_task - move idle task to the _front_ of runqueue. + */ +static void activate_idle_task(struct task_struct *p, struct rq *rq) +{ + update_rq_clock(rq); + + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, 0); + inc_nr_running(p, rq); +} + +/* * Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. @@ -6494,7 +6494,7 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0856701..48604ea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -892,7 +892,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * achieve that by always pre-iterating before returning * the current task: */ -static inline struct task_struct * +static struct task_struct * __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) { struct task_struct *p; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index dbe4d8c..2f26c3d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -7,7 +7,7 @@ * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ -static inline void update_curr_rt(struct rq *rq) +static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; u64 delta_exec; -- cgit v0.10.2 From a4ec24b48ddef1e93f7578be53270f0b95ad666c Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: tidy up SCHED_RR - make timeslices of SCHED_RR tasks constant and not dependent on task's static_prio [1] ; - remove obsolete code (timeslice related bits); - make sched_rr_get_interval() return something more meaningful [2] for SCHED_OTHER tasks. 
[1] according to the following link, it's not compliant with SUSv3 (not sure though, what is the reference for us :-) http://lkml.org/lkml/2007/3/7/656 [2] the interval is dynamic and can be depicted as follows "should a task be one of the runnable tasks at this particular moment, it would expect to run for this interval of time before being re-scheduled by the scheduler tick". (i.e. it's more precise if a task is runnable at the moment) yeah, this seems to require task_rq_lock/unlock() but this is not a hot path. results: (SCHED_FIFO) dimm@earth:~/storage/prog$ sudo chrt -f 10 ./rr_interval time_slice: 0 : 0 (SCHED_RR) dimm@earth:~/storage/prog$ sudo chrt 10 ./rr_interval time_slice: 0 : 99984800 (SCHED_NORMAL) dimm@earth:~/storage/prog$ ./rr_interval time_slice: 0 : 19996960 (SCHED_NORMAL + a cpu_hog of similar 'weight' on the same CPU --- so should be a half of the previous result) dimm@earth:~/storage/prog$ taskset 1 ./rr_interval time_slice: 0 : 9998480 Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index ce9bb7a..f370f10 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) /* * Some helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) #define NICE_0_LOAD SCHED_LOAD_SCALE @@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) /* * These are the 'tuning knobs' of the scheduler: * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) #ifdef CONFIG_SMP @@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) } #endif -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -/* - * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - */ -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio == NICE_TO_PRIO(19)) - return 1; - - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) @@ -4746,6 +4726,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { struct task_struct *p; + unsigned int time_slice; int retval = -EINVAL; struct timespec t; @@ -4762,9 +4743,21 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) if (retval) goto out_unlock; - jiffies_to_timespec(p->policy == SCHED_FIFO ? 
- 0 : static_prio_timeslice(p->static_prio), &t); + if (p->policy == SCHED_FIFO) + time_slice = 0; + else if (p->policy == SCHED_RR) + time_slice = DEF_TIMESLICE; + else { + struct sched_entity *se = &p->se; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + task_rq_unlock(rq, &flags); + } read_unlock(&tasklist_lock); + jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: return retval; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2f26c3d..d0097a0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) if (--p->time_slice) return; - p->time_slice = static_prio_timeslice(p->static_prio); + p->time_slice = DEF_TIMESLICE; /* * Requeue to the end of queue if we are not the only element -- cgit v0.10.2 From a03c9061d93822f66eb6287f8e9cf5833a12b49c Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup, remove calc_weighted() remove obsolete code -- calc_weighted() Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 48604ea..d8502ec 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -342,17 +342,6 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } -static inline unsigned long -calc_weighted(unsigned long delta, struct sched_entity *se) -{ - unsigned long weight = se->load.weight; - - if (unlikely(weight != NICE_0_LOAD)) - return (u64)delta * se->load.weight >> NICE_0_SHIFT; - else - return delta; -} - /* * Task is being enqueued - update stats: */ -- cgit v0.10.2 From a2a2d680735ad7c3b5250704b3863abf54ff4020 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup, make dequeue_entity() and update_stats_wait_end() similar make dequeue_entity() / enqueue_entity() and update_stats_dequeue() / update_stats_enqueue() look similar, structure-wise. zero effect, functionality-wise: text data bss dec hex filename 34550 3026 100 37676 932c sched.o.before 34550 3026 100 37676 932c sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d8502ec..7826e18 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -366,7 +366,6 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_curr(cfs_rq); /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -505,7 +504,7 @@ static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* - * Update the fair clock. + * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); @@ -524,6 +523,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { + /* + * Update run-time statistics of the 'current'. 
+ */ + update_curr(cfs_rq); + update_stats_dequeue(cfs_rq, se); if (sleep) { #ifdef CONFIG_SCHEDSTATS @@ -787,8 +791,7 @@ static void yield_task_fair(struct rq *rq) if (likely(!sysctl_sched_compat_yield)) { __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); -- cgit v0.10.2 From af92723262f3e0c431083f668b605a1dcdbe8f3d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup, remove the TASK_NONINTERACTIVE flag Here's another piece of low hanging obsolete fruit. Remove obsolete TASK_NONINTERACTIVE. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar diff --git a/fs/pipe.c b/fs/pipe.c index 6b3d91a..f1fa2b4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(&pipe->wait, &wait, - TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 47e3717..49c7b37 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -175,8 +175,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 /* in tsk->state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_DEAD 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) -- cgit v0.10.2 From 3e9830dcabdeb3656855ec1b678b6bcf3b50261c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: run sched_domain_debug() if CONFIG_SCHED_DEBUG=y run sched_domain_debug() if CONFIG_SCHED_DEBUG=y, instead of relying on the hand-crafted SCHED_DOMAIN_DEBUG switch. Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index f370f10..1a80ac1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5476,8 +5476,7 @@ int __init migration_init(void) int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); -#undef SCHED_DOMAIN_DEBUG -#ifdef SCHED_DOMAIN_DEBUG +#ifdef CONFIG_SCHED_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; -- cgit v0.10.2 From 26797a34a24cfeab9951a6f42f27432c0b2546af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: break out if printing a warning in sched_domain_debug() checkpatch.pl and Andy Whitcroft noticed the following bug: we did not break out after printing an error. 
Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 1a80ac1..7fefd8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5534,16 +5534,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); + break; } if (!cpus_weight(group->cpumask)) { printk("\n"); printk(KERN_ERR "ERROR: empty group\n"); + break; } if (cpus_intersects(groupmask, group->cpumask)) { printk("\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; } cpus_or(groupmask, groupmask, group->cpumask); -- cgit v0.10.2 From 8927f49479756c1aff76e8202ad32733c965864f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: style cleanup fix up __setup() style bug - noticed via checkpatch.pl. Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 7fefd8a..10b7bed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5680,7 +5680,7 @@ static int __init isolated_cpu_setup(char *str) return 1; } -__setup ("isolcpus=", isolated_cpu_setup); +__setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer -- cgit v0.10.2 From a65914b3658043da27c159b8a28c5811bb0a88c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: kfree(NULL) is valid kfree(NULL) is valid. pointed out by checkpatch.pl. the fix shrinks the code a bit: text data bss dec hex filename 40024 3842 100 43966 abbe sched.o.before 40002 3842 100 43944 aba8 sched.o.after Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 10b7bed..23da933 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6784,17 +6784,14 @@ struct task_grp *sched_create_group(void) err: for_each_possible_cpu(i) { - if (tg->cfs_rq && tg->cfs_rq[i]) + if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se && tg->se[i]) + if (tg->se) kfree(tg->se[i]); } - if (tg->cfs_rq) - kfree(tg->cfs_rq); - if (tg->se) - kfree(tg->se); - if (tg) - kfree(tg); + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); return ERR_PTR(-ENOMEM); } -- cgit v0.10.2 From 06877c33fe9261ccdf143492c28de93c56493079 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup: rename SCHED_FEAT_USE_TREE_AVG to SCHED_FEAT_TREE_AVG cleanup: rename SCHED_FEAT_USE_TREE_AVG to SCHED_FEAT_TREE_AVG, to make SCHED_FEAT_ names more consistent. 
Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 23da933..5bfe1df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -440,14 +440,14 @@ static void update_rq_clock(struct rq *rq) enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_START_DEBIT = 2, - SCHED_FEAT_USE_TREE_AVG = 4, + SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_USE_TREE_AVG *0 | + SCHED_FEAT_TREE_AVG *0 | SCHED_FEAT_APPROX_AVG *0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7826e18..14a9b9b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -477,7 +477,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime = cfs_rq->min_vruntime; - if (sched_feat(USE_TREE_AVG)) { + if (sched_feat(TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); if (last) { vruntime += last->vruntime; -- cgit v0.10.2 From 4cf86d77f5942336e7cd9de874b38b3c83b54d5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: rename task_grp to task_group cleanup: rename task_grp to task_group. No need to save two characters and 'grp' is annoying to read. Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 49c7b37..3cddbfc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -136,7 +136,7 @@ extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; -struct task_grp; +struct task_group; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); @@ -598,7 +598,7 @@ struct user_struct { uid_t uid; #ifdef CONFIG_FAIR_USER_SCHED - struct task_grp *tg; + struct task_group *tg; #endif }; @@ -1842,12 +1842,12 @@ extern void normalize_rt_tasks(void); #ifdef CONFIG_FAIR_GROUP_SCHED -extern struct task_grp init_task_grp; +extern struct task_group init_task_group; -extern struct task_grp *sched_create_group(void); -extern void sched_destroy_group(struct task_grp *tg); +extern struct task_group *sched_create_group(void); +extern void sched_destroy_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); -extern int sched_group_set_shares(struct task_grp *tg, unsigned long shares); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); #endif diff --git a/kernel/sched.c b/kernel/sched.c index 5bfe1df..f2b8db4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -156,7 +156,7 @@ struct rt_prio_array { struct cfs_rq; /* task group related information */ -struct task_grp { +struct task_group { /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ @@ -175,7 +175,7 @@ static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; /* Default task group. * Every task in system belong to this group at bootup. 
*/ -struct task_grp init_task_grp = { +struct task_group init_task_group = { .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, }; @@ -186,17 +186,17 @@ struct task_grp init_task_grp = { # define INIT_TASK_GRP_LOAD NICE_0_LOAD #endif -static int init_task_grp_load = INIT_TASK_GRP_LOAD; +static int init_task_group_load = INIT_TASK_GRP_LOAD; /* return group to which a task belongs */ -static inline struct task_grp *task_grp(struct task_struct *p) +static inline struct task_group *task_group(struct task_struct *p) { - struct task_grp *tg; + struct task_group *tg; #ifdef CONFIG_FAIR_USER_SCHED tg = p->user->tg; #else - tg = &init_task_grp; + tg = &init_task_group; #endif return tg; @@ -205,8 +205,8 @@ static inline struct task_grp *task_grp(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_cfs_rq(struct task_struct *p) { - p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)]; - p->se.parent = task_grp(p)->se[task_cpu(p)]; + p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; + p->se.parent = task_group(p)->se[task_cpu(p)]; } #else @@ -244,7 +244,7 @@ struct cfs_rq { * list is used during load balance. */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ - struct task_grp *tg; /* group that "owns" this runqueue */ + struct task_group *tg; /* group that "owns" this runqueue */ struct rcu_head rcu; #endif }; @@ -6522,19 +6522,19 @@ void __init sched_init(void) init_cfs_rq_p[i] = cfs_rq; init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_grp; + cfs_rq->tg = &init_task_group; list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); init_sched_entity_p[i] = se; se->cfs_rq = &rq->cfs; se->my_q = cfs_rq; - se->load.weight = init_task_grp_load; + se->load.weight = init_task_group_load; se->load.inv_weight = - div64_64(1ULL<<32, init_task_grp_load); + div64_64(1ULL<<32, init_task_group_load); se->parent = NULL; } - init_task_grp.shares = init_task_grp_load; + init_task_group.shares = init_task_group_load; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6725,9 +6725,9 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED /* allocate runqueue etc for a new task group */ -struct task_grp *sched_create_group(void) +struct task_group *sched_create_group(void) { - struct task_grp *tg; + struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; struct rq *rq; @@ -6800,7 +6800,7 @@ err: static void free_sched_group(struct rcu_head *rhp) { struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); - struct task_grp *tg = cfs_rq->tg; + struct task_group *tg = cfs_rq->tg; struct sched_entity *se; int i; @@ -6819,7 +6819,7 @@ static void free_sched_group(struct rcu_head *rhp) } /* Destroy runqueue etc associated with a task group */ -void sched_destroy_group(struct task_grp *tg) +void sched_destroy_group(struct task_group *tg) { struct cfs_rq *cfs_rq; int i; @@ -6895,7 +6895,7 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) spin_unlock_irq(&rq->lock); } -int sched_group_set_shares(struct task_grp *tg, unsigned long shares) +int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 48748d0..6f87b31 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -239,7 +239,7 @@ static int root_user_share_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - return sprintf(page, "%d\n", init_task_grp_load); + return 
sprintf(page, "%d\n", init_task_group_load); } static int @@ -260,8 +260,8 @@ root_user_share_write_proc(struct file *file, const char __user *buffer, mutex_lock(&root_user_share_mutex); - init_task_grp_load = shares; - rc = sched_group_set_shares(&init_task_grp, shares); + init_task_group_load = shares; + rc = sched_group_set_shares(&init_task_group, shares); mutex_unlock(&root_user_share_mutex); diff --git a/kernel/user.c b/kernel/user.c index c6387fa..0c9a787 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,7 +51,7 @@ struct user_struct root_user = { .session_keyring = &root_session_keyring, #endif #ifdef CONFIG_FAIR_USER_SCHED - .tg = &init_task_grp, + .tg = &init_task_group, #endif }; -- cgit v0.10.2 From 0702e3ebc1e42576a04d29f8adacf13be825b800 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: function prototype cleanups noticed by Thomas Gleixner: cleanup: function prototype cleanups - move into single line wherever possible. Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 14a9b9b..a9dfb77 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -112,8 +112,7 @@ static inline struct task_struct *task_of(struct sched_entity *se) * Scheduling class tree data structure manipulation methods: */ -static inline u64 -max_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta > 0) @@ -122,8 +121,7 @@ max_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline u64 -min_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -132,8 +130,7 @@ min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline s64 -entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->vruntime - cfs_rq->min_vruntime; } @@ -141,8 +138,7 @@ entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Enqueue an entity into the rb-tree: */ -static void -__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; @@ -179,8 +175,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } -static void -__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); -- cgit v0.10.2 From 00bf7bfc2eaf775b634774e9ec435d720b6ecee7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: fix: move the CPU check into ->task_new_fair() noticed by Peter Zijlstra: fix: move the CPU check into ->task_new_fair(), this way we can call place_entity() and get child ->vruntime right at initial wakeup time. 
(without this there can be large latencies) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra diff --git a/kernel/sched.c b/kernel/sched.c index f2b8db4..b41ef66 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1660,17 +1660,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int this_cpu; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); /* parent's CPU */ update_rq_clock(rq); p->prio = effective_prio(p); - if (task_cpu(p) != this_cpu || !p->sched_class->task_new || - !current->se.on_rq) { + if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) { activate_task(rq, p, 0); } else { /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a9dfb77..f5f49176 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1007,13 +1007,14 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + int this_cpu = smp_processor_id(); sched_info_queued(p); update_curr(cfs_rq); place_entity(cfs_rq, se, 1); - if (sysctl_sched_child_runs_first && + if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && curr->vruntime < se->vruntime) { /* * Upon rescheduling, sched_class::put_prev_task() will place -- cgit v0.10.2 From a58f6f253d268f7b9712bd13c344a1fd89a3192f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: export cpu_clock() export cpu_clock() - the preferred API instead of sched_clock(). Signed-off-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index b41ef66..a3c3ec8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -470,6 +470,7 @@ unsigned long long cpu_clock(int cpu) return now; } +EXPORT_SYMBOL_GPL(cpu_clock); #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) -- cgit v0.10.2 From 810e95ccd58d91369191aa4ecc9e6d4a10d8d0c8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: another wakeup_granularity fix unit mis-match: wakeup_gran was used against a vruntime Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f5f49176..3ecbfd0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -818,7 +818,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - s64 delta; + s64 delta, gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -833,8 +833,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) } delta = se->vruntime - pse->vruntime; + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); - if (delta > (s64)sysctl_sched_wakeup_granularity) + if (delta > gran) resched_task(curr); } -- cgit v0.10.2 From 8ca0e14ffb12c257de591571a9e96102acdb1c64 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: disable sleeper_fairness on SCHED_BATCH disable sleeper fairness for batch tasks - they are about batch processing after all. 
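For illustration only (not part of this patch): a task lands in the SCHED_BATCH case above by selecting the batch policy from user space. A minimal sketch, assuming a glibc that exposes SCHED_BATCH when _GNU_SOURCE is defined:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* SCHED_BATCH requires a static priority of 0 */
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_BATCH, &param) == -1) {
		perror("sched_setscheduler");
		return 1;
	}
	/* from here on, this task gets no NEW_FAIR_SLEEPERS credit on wakeup */
	return 0;
}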
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3ecbfd0..410b77a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -485,7 +485,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { - if (sched_feat(NEW_FAIR_SLEEPERS)) + struct task_struct *p = container_of(se, struct task_struct, se); + + if (sched_feat(NEW_FAIR_SLEEPERS) && p->policy != SCHED_BATCH) vruntime -= sysctl_sched_latency; vruntime = max_t(s64, vruntime, se->vruntime); -- cgit v0.10.2 From 5cb350baf580017da38199625b7365b1763d7180 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: group scheduling, sysfs tunables Add tunables in sysfs to modify a user's cpu share. A directory is created in sysfs for each new user in the system. /sys/kernel/uids//cpu_share Reading this file returns the cpu shares granted for the user. Writing into this file modifies the cpu share for the user. Only an administrator is allowed to modify a user's cpu share. Ex: # cd /sys/kernel/uids/ # cat 512/cpu_share 1024 # echo 2048 > 512/cpu_share # cat 512/cpu_share 2048 # Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 84901e7..88bcb87 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt @@ -117,3 +117,70 @@ Some implementation details: iterators of the scheduling modules are used. The balancing code got quite a bit simpler as a result. + +Group scheduler extension to CFS +================================ + +Normally the scheduler operates on individual tasks and strives to provide +fair CPU time to each task. Sometimes, it may be desirable to group tasks +and provide fair CPU time to each such task group. For example, it may +be desirable to first provide fair CPU time to each user on the system +and then to each task belonging to a user. + +CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets +SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such +groups. At present, there are two (mutually exclusive) mechanisms to group +tasks for CPU bandwidth control purpose: + + - Based on user id (CONFIG_FAIR_USER_SCHED) + In this option, tasks are grouped according to their user id. + - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) + This options lets the administrator create arbitrary groups + of tasks, using the "cgroup" pseudo filesystem. See + Documentation/cgroups.txt for more information about this + filesystem. + +Only one of these options to group tasks can be chosen and not both. + +Group scheduler tunables: + +When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for +each new user and a "cpu_share" file is added in that directory. + + # cd /sys/kernel/uids + # cat 512/cpu_share # Display user 512's CPU share + 1024 + # echo 2048 > 512/cpu_share # Modify user 512's CPU share + # cat 512/cpu_share # Display user 512's CPU share + 2048 + # + +CPU bandwidth between two users are divided in the ratio of their CPU shares. 
+For ex: if you would like user "root" to get twice the bandwidth of user +"guest", then set the cpu_share for both the users such that "root"'s +cpu_share is twice "guest"'s cpu_share + + +When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created +for each group created using the pseudo filesystem. See example steps +below to create task groups and modify their CPU share using the "cgroups" +pseudo filesystem + + # mkdir /dev/cpuctl + # mount -t cgroup -ocpu none /dev/cpuctl + # cd /dev/cpuctl + + # mkdir multimedia # create "multimedia" group of tasks + # mkdir browser # create "browser" group of tasks + + # #Configure the multimedia group to receive twice the CPU bandwidth + # #that of browser group + + # echo 2048 > multimedia/cpu.shares + # echo 1024 > browser/cpu.shares + + # firefox & # Launch firefox and move it to "browser" group + # echo > browser/tasks + + # #Launch gmplayer (or your favourite movie player) + # echo > multimedia/tasks diff --git a/include/linux/sched.h b/include/linux/sched.h index 3cddbfc..04233c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -87,6 +87,7 @@ struct sched_param { #include #include #include +#include #include @@ -599,9 +600,18 @@ struct user_struct { #ifdef CONFIG_FAIR_USER_SCHED struct task_group *tg; + struct kset kset; + struct subsys_attribute user_attr; + struct work_struct work; #endif }; +#ifdef CONFIG_FAIR_USER_SCHED +extern int uids_kobject_init(void); +#else +static inline int uids_kobject_init(void) { return 0; } +#endif + extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; @@ -1848,6 +1858,7 @@ extern struct task_group *sched_create_group(void); extern void sched_destroy_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern unsigned long sched_group_shares(struct task_group *tg); #endif diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d0e5c48..6046939 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -116,6 +117,13 @@ static int __init ksysfs_init(void) ¬es_attr); } + /* + * Create "/sys/kernel/uids" directory and corresponding root user's + * directory under it. 
+ */ + if (!error) + error = uids_kobject_init(); + return error; } diff --git a/kernel/sched.c b/kernel/sched.c index a3c3ec8..9ac9989 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -162,6 +162,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + /* spinlock to serialize modification to shares */ + spinlock_t lock; }; /* Default task group's sched entity on each cpu */ @@ -6533,6 +6535,7 @@ void __init sched_init(void) se->parent = NULL; } init_task_group.shares = init_task_group_load; + spin_lock_init(&init_task_group.lock); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6777,6 +6780,7 @@ struct task_group *sched_create_group(void) } tg->shares = NICE_0_LOAD; + spin_lock_init(&tg->lock); return tg; @@ -6897,8 +6901,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + spin_lock(&tg->lock); if (tg->shares == shares) - return 0; + goto done; /* return -EINVAL if the new value is not sane */ @@ -6906,7 +6911,14 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); +done: + spin_unlock(&tg->lock); return 0; } +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6f87b31..0aab455 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -231,45 +231,6 @@ static void sysrq_sched_debug_show(void) sched_debug_show(NULL, NULL); } -#ifdef CONFIG_FAIR_USER_SCHED - -static DEFINE_MUTEX(root_user_share_mutex); - -static int -root_user_share_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - return sprintf(page, "%d\n", init_task_group_load); -} - -static int -root_user_share_write_proc(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - unsigned long shares; - char kbuf[sizeof(unsigned long)+1]; - int rc = 0; - - if (copy_from_user(kbuf, buffer, sizeof(kbuf))) - return -EFAULT; - - shares = simple_strtoul(kbuf, NULL, 0); - - if (!shares) - shares = NICE_0_LOAD; - - mutex_lock(&root_user_share_mutex); - - init_task_group_load = shares; - rc = sched_group_set_shares(&init_task_group, shares); - - mutex_unlock(&root_user_share_mutex); - - return (rc < 0 ? rc : count); -} - -#endif /* CONFIG_FAIR_USER_SCHED */ - static int sched_debug_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_debug_show, NULL); @@ -292,15 +253,6 @@ static int __init init_sched_debug_procfs(void) pe->proc_fops = &sched_debug_fops; -#ifdef CONFIG_FAIR_USER_SCHED - pe = create_proc_entry("root_user_cpu_share", 0644, NULL); - if (!pe) - return -ENOMEM; - - pe->read_proc = root_user_share_read_proc; - pe->write_proc = root_user_share_write_proc; -#endif - return 0; } diff --git a/kernel/user.c b/kernel/user.c index 0c9a787..74cadea 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,7 +55,41 @@ struct user_struct root_user = { #endif }; +/* + * These routines must be called with the uidhash spinlock held! 
+ */ +static inline void uid_hash_insert(struct user_struct *up, + struct hlist_head *hashent) +{ + hlist_add_head(&up->uidhash_node, hashent); +} + +static inline void uid_hash_remove(struct user_struct *up) +{ + hlist_del_init(&up->uidhash_node); +} + +static inline struct user_struct *uid_hash_find(uid_t uid, + struct hlist_head *hashent) +{ + struct user_struct *user; + struct hlist_node *h; + + hlist_for_each_entry(user, h, hashent, uidhash_node) { + if (user->uid == uid) { + atomic_inc(&user->__count); + return user; + } + } + + return NULL; +} + #ifdef CONFIG_FAIR_USER_SCHED + +static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ +static DEFINE_MUTEX(uids_mutex); + static void sched_destroy_user(struct user_struct *up) { sched_destroy_group(up->tg); @@ -77,42 +111,173 @@ static void sched_switch_user(struct task_struct *p) sched_move_task(p); } -#else /* CONFIG_FAIR_USER_SCHED */ +static inline void uids_mutex_lock(void) +{ + mutex_lock(&uids_mutex); +} -static void sched_destroy_user(struct user_struct *up) { } -static int sched_create_user(struct user_struct *up) { return 0; } -static void sched_switch_user(struct task_struct *p) { } +static inline void uids_mutex_unlock(void) +{ + mutex_unlock(&uids_mutex); +} -#endif /* CONFIG_FAIR_USER_SCHED */ +/* return cpu shares held by the user */ +ssize_t cpu_shares_show(struct kset *kset, char *buffer) +{ + struct user_struct *up = container_of(kset, struct user_struct, kset); -/* - * These routines must be called with the uidhash spinlock held! + return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); +} + +/* modify cpu shares held by the user */ +ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) +{ + struct user_struct *up = container_of(kset, struct user_struct, kset); + unsigned long shares; + int rc; + + sscanf(buffer, "%lu", &shares); + + rc = sched_group_set_shares(up->tg, shares); + + return (rc ? rc : size); +} + +static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) +{ + sa->attr.name = name; + sa->attr.mode = mode; + sa->show = cpu_shares_show; + sa->store = cpu_shares_store; +} + +/* Create "/sys/kernel/uids/" directory and + * "/sys/kernel/uids//cpu_share" file for this user. 
*/ -static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) +static int user_kobject_create(struct user_struct *up) { - hlist_add_head(&up->uidhash_node, hashent); + struct kset *kset = &up->kset; + struct kobject *kobj = &kset->kobj; + int error; + + memset(kset, 0, sizeof(struct kset)); + kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ + kobject_set_name(kobj, "%d", up->uid); + kset_init(kset); + user_attr_init(&up->user_attr, "cpu_share", 0644); + + error = kobject_add(kobj); + if (error) + goto done; + + error = sysfs_create_file(kobj, &up->user_attr.attr); + if (error) + kobject_del(kobj); + +done: + return error; } -static inline void uid_hash_remove(struct user_struct *up) +/* create these in sysfs filesystem: + * "/sys/kernel/uids" directory + * "/sys/kernel/uids/0" directory (for root user) + * "/sys/kernel/uids/0/cpu_share" file (for root user) + */ +int __init uids_kobject_init(void) { - hlist_del_init(&up->uidhash_node); + int error; + + /* create under /sys/kernel dir */ + uids_kobject.parent = &kernel_subsys.kobj; + kobject_set_name(&uids_kobject, "uids"); + kobject_init(&uids_kobject); + + error = kobject_add(&uids_kobject); + if (!error) + error = user_kobject_create(&root_user); + + return error; } -static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +/* work function to remove sysfs directory for a user and free up + * corresponding structures. + */ +static void remove_user_sysfs_dir(struct work_struct *w) { - struct user_struct *user; - struct hlist_node *h; + struct user_struct *up = container_of(w, struct user_struct, work); + struct kobject *kobj = &up->kset.kobj; + unsigned long flags; + int remove_user = 0; - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if(user->uid == uid) { - atomic_inc(&user->__count); - return user; - } + /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() + * atomic. + */ + uids_mutex_lock(); + + local_irq_save(flags); + + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + uid_hash_remove(up); + remove_user = 1; + spin_unlock_irqrestore(&uidhash_lock, flags); + } else { + local_irq_restore(flags); } - return NULL; + if (!remove_user) + goto done; + + sysfs_remove_file(kobj, &up->user_attr.attr); + kobject_del(kobj); + + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); + +done: + uids_mutex_unlock(); +} + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. + */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + /* restore back the count */ + atomic_inc(&up->__count); + spin_unlock_irqrestore(&uidhash_lock, flags); + + INIT_WORK(&up->work, remove_user_sysfs_dir); + schedule_work(&up->work); } +#else /* CONFIG_FAIR_USER_SCHED */ + +static void sched_destroy_user(struct user_struct *up) { } +static int sched_create_user(struct user_struct *up) { return 0; } +static void sched_switch_user(struct task_struct *p) { } +static inline int user_kobject_create(struct user_struct *up) { return 0; } +static inline void uids_mutex_lock(void) { } +static inline void uids_mutex_unlock(void) { } + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. 
+ */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + uid_hash_remove(up); + spin_unlock_irqrestore(&uidhash_lock, flags); + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); +} + +#endif /* CONFIG_FAIR_USER_SCHED */ + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). @@ -139,16 +304,10 @@ void free_uid(struct user_struct *up) return; local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { - uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); - sched_destroy_user(up); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - } else { + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + free_user(up, flags); + else local_irq_restore(flags); - } } struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) @@ -156,6 +315,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; + /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() + * atomic. + */ + uids_mutex_lock(); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -191,6 +355,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) return NULL; } + if (user_kobject_create(new)) { + sched_destroy_user(new); + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + uids_mutex_unlock(); + return NULL; + } + /* * Before adding this, check whether we raced * on adding the same user already.. @@ -198,7 +371,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - sched_destroy_user(new); + /* This case is not possible when CONFIG_FAIR_USER_SCHED + * is defined, since we serialize alloc_uid() using + * uids_mutex. Hence no need to call + * sched_destroy_user() or remove_user_sysfs_dir(). + */ key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -209,6 +386,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_unlock_irq(&uidhash_lock); } + + uids_mutex_unlock(); + return up; } -- cgit v0.10.2 From 638e13ac37a1a89473415f407cbffc1688a20fe2 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: fix rt ptracer monopolizing CPU yield() in wait_task_inactive(), can cause a high priority thread to be scheduled back in, and there by loop forever while it is waiting for some lower priority thread which is unfortunately still on the runqueue. Use schedule_timeout_uninterruptible(1) instead. Signed-off-by: Gautham R Shenoy Credit: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 9ac9989..48fc74b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1133,7 +1133,7 @@ repeat: * yield - it could be a while. 
*/ if (unlikely(on_rq)) { - yield(); + schedule_timeout_uninterruptible(1); goto repeat; } -- cgit v0.10.2 From ace8b3d633f93da8535921bf3e3679db3c619578 Mon Sep 17 00:00:00 2001 From: Zou Nan hai Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: some proc entries are missed in sched_domain sys_ctl debug code cache_nice_tries and flags entry do not appear in proc fs sched_domain directory, because ctl_table entry is skipped. This patch fixes the issue. Signed-off-by: Zou Nan hai Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 48fc74b..b7dff36 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5284,7 +5284,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(12); set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); @@ -5304,10 +5304,10 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "cache_nice_tries", + set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[12], "flags", &sd->flags, + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); return table; -- cgit v0.10.2 From e62dd02ed0af35631c6ca473e50758c9594773cf Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: fix group scheduling for SCHED_BATCH The following patch (sched: disable sleeper_fairness on SCHED_BATCH) seems to break GROUP_SCHED. Although, it may be 'oops'-less due to the possibility of 'p' being always a valid address. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 410b77a..3ac096e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -485,9 +485,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { - struct task_struct *p = container_of(se, struct task_struct, se); - - if (sched_feat(NEW_FAIR_SLEEPERS) && p->policy != SCHED_BATCH) + if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && + task_of(se)->policy != SCHED_BATCH) vruntime -= sysctl_sched_latency; vruntime = max_t(s64, vruntime, se->vruntime); -- cgit v0.10.2 From ce6c131131df442f0d49d064129ecc52d9fe8ca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: disable forced preemption by default Implement feature bit to disable forced preemption. This way it can be checked whether a workload is overscheduling or not. 
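For illustration only (not part of this patch): on a CONFIG_SCHED_DEBUG kernel, and assuming the const_debug knob is exported as /proc/sys/kernel/sched_features, the bit can be flipped at run time. The value 16 is SCHED_FEAT_WAKEUP_PREEMPT from the hunk below; a minimal user-space sketch:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_features", "r+");
	unsigned int features;

	if (!f)
		return 1;
	if (fscanf(f, "%u", &features) == 1) {
		rewind(f);
		/* clear SCHED_FEAT_WAKEUP_PREEMPT (16) to disable forced preemption */
		fprintf(f, "%u\n", features & ~16u);
	}
	fclose(f);
	return 0;
}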
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index b7dff36..0bd8f2c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -444,13 +444,15 @@ enum { SCHED_FEAT_START_DEBIT = 2, SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, + SCHED_FEAT_WAKEUP_PREEMPT = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_TREE_AVG *0 | - SCHED_FEAT_APPROX_AVG *0; + SCHED_FEAT_APPROX_AVG *0 | + SCHED_FEAT_WAKEUP_PREEMPT *1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3ac096e..3843ec7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -626,7 +626,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) */ update_curr(cfs_rq); - if (cfs_rq->nr_running > 1) + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } @@ -828,18 +828,20 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } + if (sched_feat(WAKEUP_PREEMPT)) { + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } - delta = se->vruntime - pse->vruntime; - gran = sysctl_sched_wakeup_granularity; - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + delta = se->vruntime - pse->vruntime; + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); - if (delta > gran) - resched_task(curr); + if (delta > gran) + resched_task(curr); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v0.10.2 From 95938a35c5562afa7af7252821e44132391a3db8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: prevent wakeup over-scheduling Prevent wakeup over-scheduling. Once a task has been preempted by a task of the same or lower priority, it becomes ineligible for repeated preemption by same until it has been ticked, or slept. Instead, the task is marked for preemption at the next tick. Tasks of higher priority still preempt immediately. 
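a compressed sketch of that decision, using illustrative struct and field names (the real logic lives in check_preempt_wakeup() and check_preempt_tick()):

#include <stdio.h>

/* lower prio value means higher priority, as in the kernel */
struct ent { int prio; int peer_preempt; };

/* should the waking task preempt the current one immediately? */
static int preempt_now(struct ent *waker, struct ent *curr, int restrict_on)
{
        if (!restrict_on)                       /* feature disabled: old behaviour */
                return 1;
        if (waker->prio < curr->prio)           /* higher priority always preempts */
                return 1;
        /* same or lower priority: only the first wakeup preempts right away,
         * later wakeups merely mark curr for preemption at the next tick */
        return !curr->peer_preempt++;
}

int main(void)
{
        struct ent curr  = { .prio = 120, .peer_preempt = 0 };
        struct ent waker = { .prio = 120 };
        int a = preempt_now(&waker, &curr, 1);  /* 1: first wakeup preempts */
        int b = preempt_now(&waker, &curr, 1);  /* 0: restricted until the tick */
        int c = preempt_now(&waker, &curr, 1);  /* 0 */

        printf("%d %d %d\n", a, b, c);
        return 0;
}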
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 04233c8..8be5b57 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -912,6 +912,7 @@ struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; + int peer_preempt; u64 exec_start; u64 sum_exec_runtime; diff --git a/kernel/sched.c b/kernel/sched.c index 0bd8f2c..e8051bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -445,6 +445,7 @@ enum { SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, SCHED_FEAT_WAKEUP_PREEMPT = 16, + SCHED_FEAT_PREEMPT_RESTRICT = 32, }; const_debug unsigned int sysctl_sched_features = @@ -452,7 +453,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_TREE_AVG *0 | SCHED_FEAT_APPROX_AVG *0 | - SCHED_FEAT_WAKEUP_PREEMPT *1; + SCHED_FEAT_WAKEUP_PREEMPT *1 | + SCHED_FEAT_PREEMPT_RESTRICT *1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3843ec7..f819f943 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -526,6 +526,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + se->peer_preempt = 0; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -553,8 +554,10 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) + if (delta_exec > ideal_runtime || + (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) resched_task(rq_of(cfs_rq)->curr); + curr->peer_preempt = 0; } static void @@ -839,8 +842,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); - if (delta > gran) - resched_task(curr); + if (delta > gran) { + int now = !sched_feat(PREEMPT_RESTRICT); + + if (now || p->prio < curr->prio || !se->peer_preempt++) + resched_task(curr); + } } } @@ -1034,6 +1041,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) check_spread(cfs_rq, curr); __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); + se->peer_preempt = 0; resched_task(rq->curr); } -- cgit v0.10.2 From d274a4cee190c880ec25b60501efe50c4435b3d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: update comment update comment: clarify time-slices and remove obsolete tuning detail. Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f819f943..ec1592e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,14 +25,12 @@ * (default: 20ms, units: nanoseconds) * * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length. - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches field) + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. * - * On SMP systems the value of this is multiplied by the log2 of the - * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way - * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 
- * Targeted preemption latency for CPU-bound tasks: + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches (cs) field) */ const_debug unsigned int sysctl_sched_latency = 20000000ULL; -- cgit v0.10.2 From 3a5c359a58c39801d838c508f127bdb228af28b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: remove unnecessary gotos Replace loops implemented with gotos with real loops. Replace err = ...; goto x; x: return err; with return ...; No functional changes. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index e8051bd..4c15b17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -562,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { - struct rq *rq; - -repeat_lock_task: - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + struct rq *rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock(&rq->lock); - goto repeat_lock_task; } - return rq; } /* @@ -584,15 +581,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) { struct rq *rq; -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; } - return rq; } static void __task_rq_unlock(struct rq *rq) @@ -1083,69 +1079,71 @@ void wait_task_inactive(struct task_struct *p) int running, on_rq; struct rq *rq; -repeat: - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) - cpu_relax(); + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) + cpu_relax(); - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - on_rq = p->se.on_rq; - task_rq_unlock(rq, &flags); + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. 
+ */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + on_rq = p->se.on_rq; + task_rq_unlock(rq, &flags); - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - goto repeat; - } + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it wa still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); - goto repeat; - } + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it wa still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + schedule_timeout_uninterruptible(1); + continue; + } - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } } /*** @@ -1236,7 +1234,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Skip over this group if it has no CPUs allowed */ if (!cpus_intersects(group->cpumask, p->cpus_allowed)) - goto nextgroup; + continue; local_group = cpu_isset(this_cpu, group->cpumask); @@ -1264,9 +1262,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) min_load = avg_load; idlest = group; } -nextgroup: - group = group->next; - } while (group != sd->groups); + } while (group = group->next, group != sd->groups); if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -3517,27 +3513,30 @@ asmlinkage void __sched preempt_schedule(void) if (likely(ti->preempt_count || irqs_disabled())) return; -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - schedule(); + schedule(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. 
+ */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } EXPORT_SYMBOL(preempt_schedule); @@ -3557,29 +3556,32 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); - local_irq_disable(); + local_irq_enable(); + schedule(); + local_irq_disable(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } #endif /* CONFIG_PREEMPT */ @@ -4324,10 +4326,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) asmlinkage long sys_sched_getscheduler(pid_t pid) { struct task_struct *p; - int retval = -EINVAL; + int retval; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4338,8 +4340,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) retval = p->policy; } read_unlock(&tasklist_lock); - -out_nounlock: return retval; } @@ -4352,10 +4352,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) { struct sched_param lp; struct task_struct *p; - int retval = -EINVAL; + int retval; if (!param || pid < 0) - goto out_nounlock; + return -EINVAL; read_lock(&tasklist_lock); p = find_process_by_pid(pid); @@ -4375,7 +4375,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) */ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -out_nounlock: return retval; out_unlock: @@ -4731,11 +4730,11 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { struct task_struct *p; unsigned int time_slice; - int retval = -EINVAL; + int retval; struct timespec t; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4763,8 +4762,8 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; -out_nounlock: return retval; + out_unlock: read_unlock(&tasklist_lock); return retval; @@ -5070,35 +5069,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) struct rq *rq; int dest_cpu; -restart: - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. 
*/ - if (dest_cpu == NR_CPUS) { - rq = task_rq_lock(p, &flags); - cpus_setall(p->cpus_allowed); - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); + do { + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, p->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(p->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(p, &flags); + cpus_setall(p->cpus_allowed); + dest_cpu = any_online_cpu(p->cpus_allowed); + task_rq_unlock(rq, &flags); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - p->pid, p->comm, dead_cpu); - } - if (!__migrate_task(p, dead_cpu, dest_cpu)) - goto restart; + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + p->pid, p->comm, dead_cpu); + } + } while (!__migrate_task(p, dead_cpu, dest_cpu)); } /* @@ -5913,24 +5911,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; + do { + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); - } - sg = sg->next; - if (sg != group_head) - goto next_sg; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); + } + sg = sg->next; + } while (sg != group_head); } #endif -- cgit v0.10.2 From 8cbbe86dfcfd68ad69916164bdc838d9e09adca8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: refactor common code of sleep_on / wait_for_completion Refactor common code of sleep_on / wait_for_completion These functions were largely cut'n'pasted. This moves the common code into single helpers instead. Advantage is about 1k less code on x86-64 and 91 lines of code removed. It adds one function call to the non timeout version of the functions; i don't expect this to be measurable. 
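the shape of the change, as a freestanding sketch with illustrative names: the copied loops collapse into one helper parameterised by timeout and sleep state, and the exported entry points become one-line wrappers.

#include <stdio.h>
#include <limits.h>

enum sleep_state { UNINTERRUPTIBLE, INTERRUPTIBLE };

/* single copy of the wait loop, parameterised by timeout and state */
static long wait_common(long timeout, enum sleep_state state)
{
        (void)state;            /* loop body elided in this sketch */
        return timeout;
}

/* the formerly copy-and-pasted entry points become thin wrappers */
static long wait_plain(void)         { return wait_common(LONG_MAX, UNINTERRUPTIBLE); }
static long wait_timeout(long t)     { return wait_common(t, UNINTERRUPTIBLE); }
static long wait_interruptible(void) { return wait_common(LONG_MAX, INTERRUPTIBLE); }

int main(void)
{
        printf("%ld %ld %ld\n", wait_plain(), wait_timeout(42), wait_interruptible());
        return 0;
}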
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 4c15b17..db88b56 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3697,206 +3697,116 @@ void fastcall complete_all(struct completion *x) } EXPORT_SYMBOL(complete_all); -void fastcall __sched wait_for_completion(struct completion *x) +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) { - might_sleep(); - - spin_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; - spin_unlock_irq(&x->wait.lock); -} -EXPORT_SYMBOL(wait_for_completion); - -unsigned long fastcall __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - __set_current_state(TASK_UNINTERRUPTIBLE); + if (state == TASK_INTERRUPTIBLE && + signal_pending(current)) { + __remove_wait_queue(&x->wait, &wait); + return -ERESTARTSYS; + } + __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); if (!timeout) { __remove_wait_queue(&x->wait, &wait); - goto out; + return timeout; } } while (!x->done); __remove_wait_queue(&x->wait, &wait); } x->done--; -out: - spin_unlock_irq(&x->wait.lock); return timeout; } -EXPORT_SYMBOL(wait_for_completion_timeout); -int fastcall __sched wait_for_completion_interruptible(struct completion *x) +static long __sched +wait_for_common(struct completion *x, long timeout, int state) { - int ret = 0; - might_sleep(); spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - ret = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; -out: + timeout = do_wait_for_common(x, timeout, state); spin_unlock_irq(&x->wait.lock); + return timeout; +} - return ret; +void fastcall __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } -EXPORT_SYMBOL(wait_for_completion_interruptible); +EXPORT_SYMBOL(wait_for_completion); unsigned long fastcall __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) +wait_for_completion_timeout(struct completion *x, unsigned long timeout) { - might_sleep(); - - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - timeout = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - goto out; - } - } while (!x->done); - 
__remove_wait_queue(&x->wait, &wait); - } - x->done--; -out: - spin_unlock_irq(&x->wait.lock); - return timeout; + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); } -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +EXPORT_SYMBOL(wait_for_completion_timeout); -static inline void -sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +int __sched wait_for_completion_interruptible(struct completion *x) { - spin_lock_irqsave(&q->lock, *flags); - __add_wait_queue(q, wait); - spin_unlock(&q->lock); + return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); } +EXPORT_SYMBOL(wait_for_completion_interruptible); -static inline void -sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +unsigned long fastcall __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) { - spin_lock_irq(&q->lock); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, *flags); + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); } +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); -void __sched interruptible_sleep_on(wait_queue_head_t *q) +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) { unsigned long flags; wait_queue_t wait; init_waitqueue_entry(&wait, current); - current->state = TASK_INTERRUPTIBLE; + __set_current_state(state); - sleep_on_head(q, &wait, &flags); - schedule(); - sleep_on_tail(q, &wait, &flags); + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, &wait); + spin_unlock(&q->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&q->lock); + __remove_wait_queue(q, &wait); + spin_unlock_irqrestore(&q->lock, flags); + + return timeout; +} + +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(interruptible_sleep_on); long __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_INTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); - - return timeout; + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); } EXPORT_SYMBOL(interruptible_sleep_on_timeout); void __sched sleep_on(wait_queue_head_t *q) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_UNINTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - schedule(); - sleep_on_tail(q, &wait, &flags); + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(sleep_on); long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_UNINTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); - - return timeout; + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); } EXPORT_SYMBOL(sleep_on_timeout); -- cgit v0.10.2 From 3a5e4dc12f23fb96fafd4f5d0f61e6c3070f80a5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:15 +0200 Subject: sched: cleanup: refactor normalize_rt_tasks Replace a particularly ugly ifdef with an inline and a new macro. Also split up the function to be easier to read. 
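the same cleanup pattern in a self-contained sketch (CONFIG_SMP and all names below are stand-ins, not the kernel code): the #ifdef collapses into a single predicate and the loop body moves into its own helper.

#include <stdio.h>

#define CONFIG_SMP 1                    /* pretend this is an SMP build */

struct task { int is_migration; };

#ifdef CONFIG_SMP
#define is_migration_thread(p)  ((p)->is_migration)
#else
#define is_migration_thread(p)  0
#endif

/* per-task work split out so the caller's loop stays readable */
static void normalize_one(struct task *p)
{
        printf("normalizing %p\n", (void *)p);
}

int main(void)
{
        struct task tasks[2] = { { .is_migration = 1 }, { .is_migration = 0 } };
        int i;

        for (i = 0; i < 2; i++) {
                if (is_migration_thread(&tasks[i]))
                        continue;       /* skipped without any #ifdef in the loop */
                normalize_one(&tasks[i]);
        }
        return 0;
}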
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index db88b56..2c6295b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,12 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)jiffies * (1000000000 / HZ); } +#ifdef CONFIG_SMP +#define is_migration_thread(p, rq) ((p) == (rq)->migration_thread) +#else +#define is_migration_thread(p, rq) 0 +#endif + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -6532,12 +6538,25 @@ EXPORT_SYMBOL(__might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + int on_rq; + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); + resched_task(rq->curr); + } +} + void normalize_rt_tasks(void) { struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int on_rq; read_lock_irq(&tasklist_lock); do_each_thread(g, p) { @@ -6561,26 +6580,10 @@ void normalize_rt_tasks(void) spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); -#ifdef CONFIG_SMP - /* - * Do not touch the migration thread: - */ - if (p == rq->migration_thread) - goto out_unlock; -#endif - update_rq_clock(rq); - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - activate_task(rq, p, 0); - resched_task(rq->curr); - } -#ifdef CONFIG_SMP - out_unlock: -#endif + if (!is_migration_thread(p, rq)) + normalize_task(rq, p); + __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p); -- cgit v0.10.2 From d5036e89dcf7c19b3d03219d7d385bc96965b7fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:15 +0200 Subject: sched: clean up is_migration_thread() clean up is_migration_thread() and turn it into an inline function. Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 2c6295b..7ef66bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,12 +75,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)jiffies * (1000000000 / HZ); } -#ifdef CONFIG_SMP -#define is_migration_thread(p, rq) ((p) == (rq)->migration_thread) -#else -#define is_migration_thread(p, rq) 0 -#endif - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -371,6 +365,15 @@ static inline int cpu_of(struct rq *rq) #endif } +static inline int is_migration_thread(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_SMP + return p == rq->migration_thread; +#else + return 0; +#endif +} + /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: -- cgit v0.10.2 From 1666703af948ae87c87c2bc7121aa34271cc52ab Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: remove stale comment from sched_group_set_shares() remove stale comment from sched_group_set_shares(). Function never returns -EINVAL. 
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 7ef66bd..fc61b1f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6822,8 +6822,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - /* return -EINVAL if the new value is not sane */ - tg->shares = shares; for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); -- cgit v0.10.2 From 178be793485d70d871a0fd46b29e9e3e7da636ad Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: do not normalize kernel threads via SysRq-N do not normalize kernel threads via SysRq-N: the migration threads, softlockup threads, etc. might be essential for the system to function properly. So only zap user tasks. pointed out by Andi Kleen. Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index fc61b1f..791dd08 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -365,15 +365,6 @@ static inline int cpu_of(struct rq *rq) #endif } -static inline int is_migration_thread(struct task_struct *p, struct rq *rq) -{ -#ifdef CONFIG_SMP - return p == rq->migration_thread; -#else - return 0; -#endif -} - /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: @@ -6563,6 +6554,12 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -6584,8 +6581,7 @@ void normalize_rt_tasks(void) spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); - if (!is_migration_thread(p, rq)) - normalize_task(rq, p); + normalize_task(rq, p); __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); -- cgit v0.10.2 From fb7dde37ece82e13de383afd7042c45df67a9d17 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: generate uevents for user creation/destruction Generate uevents when a user is being created/destroyed. These events can be used to configure cpu share of a new user. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar diff --git a/kernel/user.c b/kernel/user.c index 74cadea..f0e561e 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -174,6 +174,8 @@ static int user_kobject_create(struct user_struct *up) if (error) kobject_del(kobj); + kobject_uevent(kobj, KOBJ_ADD); + done: return error; } @@ -189,6 +191,7 @@ int __init uids_kobject_init(void) /* create under /sys/kernel dir */ uids_kobject.parent = &kernel_subsys.kobj; + uids_kobject.kset = &kernel_subsys; kobject_set_name(&uids_kobject, "uids"); kobject_init(&uids_kobject); @@ -228,6 +231,7 @@ static void remove_user_sysfs_dir(struct work_struct *w) goto done; sysfs_remove_file(kobj, &up->user_attr.attr); + kobject_uevent(kobj, KOBJ_REMOVE); kobject_del(kobj); sched_destroy_user(up); -- cgit v0.10.2 From 91c234b4e3419c786cac2d5b7a7b96443e512e3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: do not wakeup-preempt with SCHED_BATCH tasks do not wakeup-preempt with SCHED_BATCH tasks, their preemption is batched too, driven by the tick. 
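for reference, a task opts into this batched behaviour from userspace through sched_setscheduler(); a minimal sketch, with error handling reduced to perror():

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param param = { .sched_priority = 0 };

        /* SCHED_BATCH tasks are driven by the tick; wakeups no longer preempt for them */
        if (sched_setscheduler(0, SCHED_BATCH, &param) != 0)
                perror("sched_setscheduler");
        return 0;
}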
Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec1592e..c240b72 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -828,6 +828,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + /* + * Batch tasks do not preempt (their preemption is driven by + * the tick): + */ + if (unlikely(p->policy == SCHED_BATCH)) + return; if (sched_feat(WAKEUP_PREEMPT)) { while (!is_same_group(se, pse)) { -- cgit v0.10.2 From e5f32a3856caabe745381279f7f32e3b581b59dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: speed up context-switches a bit speed up context-switches a bit by not clearing p->exec_start. (as a side-effect, this also makes p->exec_start a universal timestamp available to cache-hot estimations.) Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c240b72..cea1fa3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -379,15 +379,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) se->exec_start = rq_of(cfs_rq)->clock; } -/* - * We are descheduling a task - update its stats: - */ -static inline void -update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - se->exec_start = 0; -} - /************************************************** * Scheduling class queueing methods: */ @@ -609,8 +600,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); - update_stats_curr_end(cfs_rq, prev); - check_spread(cfs_rq, prev); if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); -- cgit v0.10.2 From da84d96176729fb48a8458561e5d8647103168b8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: reintroduce cache-hot affinity reintroduce a simplified version of cache-hot/cold scheduling affinity. This improves performance with certain SMP workloads, such as sysbench. Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 8be5b57..fcc9a5a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1415,6 +1415,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; +extern unsigned int sysctl_sched_migration_cost; #endif extern unsigned int sysctl_sched_compat_yield; diff --git a/kernel/sched.c b/kernel/sched.c index 791dd08..089d8b1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2119,6 +2119,17 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, } /* + * Is this task likely cache-hot: + */ +static inline int +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +{ + s64 delta = now - p->se.exec_start; + + return delta < (long long)sysctl_sched_migration_cost; +} + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static @@ -2139,6 +2150,22 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. 
+ */ + + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->clock, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + return 1; + } + + if (task_hot(p, rq->clock, sd)) + return 0; return 1; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cea1fa3..a17b785 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -74,6 +74,8 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + /************************************************************** * CFS operations on generic schedulable entities: */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 230ca4e..ec14aa8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -277,6 +277,14 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_migration_cost", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v0.10.2 From ff56b2f01537aef7237d5ac8bf6bfbb409c1a127 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: activate task_hot() only on fair-scheduled tasks activate task_hot() only for fair-scheduled tasks (i.e. disable it for RT tasks). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 089d8b1..945ab13 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2122,11 +2122,16 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Is this task likely cache-hot: */ static inline int -task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { - s64 delta = now - p->se.exec_start; + s64 delta; - return delta < (long long)sysctl_sched_migration_cost; + if (p->sched_class != &fair_sched_class) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; } /* -- cgit v0.10.2 From 2d92f22784b7b8879ebe3254e44c92cb8792b0dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: debug: increase width of debug line increase width of debug line - in preparation of more debugging info. 
Signed-off-by: Ingo Molnar diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 0aab455..7558159 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -198,7 +198,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -271,11 +271,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); - SEQ_printf(m, "----------------------------------------------\n"); + SEQ_printf(m, + "---------------------------------------------------------\n"); #define P(F) \ - SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) #define PN(F) \ - SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) PN(se.exec_start); PN(se.vruntime); @@ -292,7 +293,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.wait_max); P(sched_info.bkl_count); #endif - SEQ_printf(m, "%-25s:%20Ld\n", + SEQ_printf(m, "%-35s:%21Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); P(se.load.weight); P(policy); @@ -305,7 +306,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) t0 = sched_clock(); t1 = sched_clock(); - SEQ_printf(m, "%-25s:%20Ld\n", + SEQ_printf(m, "%-35s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } } -- cgit v0.10.2 From cc367732ff0b1c63d0d7bdd11e6d1661794ef6a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: debug, improve migration statistics add new migration statistics when SCHED_DEBUG and SCHEDSTATS is enabled. Available in /proc//sched. 
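that is, in each task's /proc/<pid>/sched file. a small sketch that dumps it for the current process, assuming a kernel built with CONFIG_SCHED_DEBUG and CONFIG_SCHEDSTATS as stated above:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char path[64], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/sched", (int)getpid());
        f = fopen(path, "r");
        if (!f) {
                perror(path);           /* e.g. kernel without CONFIG_SCHED_DEBUG */
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}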
Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index fcc9a5a..3a6e05e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -931,6 +931,24 @@ struct sched_entity { u64 block_max; u64 exec_max; u64 slice_max; + + u64 nr_migrations; + u64 nr_migrations_cold; + u64 nr_failed_migrations_affine; + u64 nr_failed_migrations_running; + u64 nr_failed_migrations_hot; + u64 nr_forced_migrations; + u64 nr_forced2_migrations; + + u64 nr_wakeups; + u64 nr_wakeups_sync; + u64 nr_wakeups_migrate; + u64 nr_wakeups_local; + u64 nr_wakeups_remote; + u64 nr_wakeups_affine; + u64 nr_wakeups_affine_attempts; + u64 nr_wakeups_passive; + u64 nr_wakeups_idle; #endif #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched.c b/kernel/sched.c index 945ab13..3b27c3a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1005,6 +1005,23 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #ifdef CONFIG_SMP +/* + * Is this task likely cache-hot: + */ +static inline int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ + s64 delta; + + if (p->sched_class != &fair_sched_class) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); @@ -1022,6 +1039,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; + if (old_cpu != new_cpu) { + schedstat_inc(p, se.nr_migrations); + if (task_hot(p, old_rq->clock, NULL)) + schedstat_inc(p, se.nr_forced2_migrations); + } #endif p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -1394,8 +1416,13 @@ static int wake_idle(int cpu, struct task_struct *p) if (sd->flags & SD_WAKE_IDLE) { cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) + if (idle_cpu(i)) { + if (i != task_cpu(p)) { + schedstat_inc(p, + se.nr_wakeups_idle); + } return i; + } } } else { break; @@ -1426,7 +1453,7 @@ static inline int wake_idle(int cpu, struct task_struct *p) */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { - int cpu, this_cpu, success = 0; + int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; long old_state; struct rq *rq; @@ -1445,6 +1472,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) goto out_running; cpu = task_cpu(p); + orig_cpu = cpu; this_cpu = smp_processor_id(); #ifdef CONFIG_SMP @@ -1488,6 +1516,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) unsigned long tl = this_load; unsigned long tl_per_task; + schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); /* @@ -1507,6 +1536,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) * there is no bad imbalance. 
*/ schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); goto out_set_cpu; } } @@ -1518,6 +1548,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_BALANCE) { if (imbalance*this_load <= 100*load) { schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); goto out_set_cpu; } } @@ -1543,6 +1574,15 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ + schedstat_inc(p, se.nr_wakeups); + if (sync) + schedstat_inc(p, se.nr_wakeups_sync); + if (orig_cpu != cpu) + schedstat_inc(p, se.nr_wakeups_migrate); + if (cpu == this_cpu) + schedstat_inc(p, se.nr_wakeups_local); + else + schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); /* @@ -2119,22 +2159,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, } /* - * Is this task likely cache-hot: - */ -static inline int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - if (p->sched_class != &fair_sched_class) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - -/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static @@ -2148,12 +2172,16 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) + if (!cpu_isset(this_cpu, p->cpus_allowed)) { + schedstat_inc(p, se.nr_failed_migrations_affine); return 0; + } *all_pinned = 0; - if (task_running(rq, p)) + if (task_running(rq, p)) { + schedstat_inc(p, se.nr_failed_migrations_running); return 0; + } /* * Aggressive migration if: @@ -2163,14 +2191,18 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) + if (task_hot(p, rq->clock, sd)) { schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.nr_forced_migrations); + } #endif return 1; } - if (task_hot(p, rq->clock, sd)) + if (task_hot(p, rq->clock, sd)) { + schedstat_inc(p, se.nr_failed_migrations_hot); return 0; + } return 1; } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7558159..27e82cb 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -260,6 +260,7 @@ __initcall(init_sched_debug_procfs); void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { + unsigned long nr_switches; unsigned long flags; int num_threads = 1; @@ -273,8 +274,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); SEQ_printf(m, "---------------------------------------------------------\n"); +#define __P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) #define P(F) \ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) @@ -282,6 +287,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.vruntime); PN(se.sum_exec_runtime); + nr_switches = p->nvcsw + p->nivcsw; + #ifdef CONFIG_SCHEDSTATS PN(se.wait_start); PN(se.sleep_start); @@ -292,14 +299,55 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.slice_max); PN(se.wait_max); 
P(sched_info.bkl_count); + P(se.nr_migrations); + P(se.nr_migrations_cold); + P(se.nr_failed_migrations_affine); + P(se.nr_failed_migrations_running); + P(se.nr_failed_migrations_hot); + P(se.nr_forced_migrations); + P(se.nr_forced2_migrations); + P(se.nr_wakeups); + P(se.nr_wakeups_sync); + P(se.nr_wakeups_migrate); + P(se.nr_wakeups_local); + P(se.nr_wakeups_remote); + P(se.nr_wakeups_affine); + P(se.nr_wakeups_affine_attempts); + P(se.nr_wakeups_passive); + P(se.nr_wakeups_idle); + + { + u64 avg_atom, avg_per_cpu; + + avg_atom = p->se.sum_exec_runtime; + if (nr_switches) + do_div(avg_atom, nr_switches); + else + avg_atom = -1LL; + + avg_per_cpu = p->se.sum_exec_runtime; + if (p->se.nr_migrations) + avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); + else + avg_per_cpu = -1LL; + + __PN(avg_atom); + __PN(avg_per_cpu); + } #endif + __P(nr_switches); SEQ_printf(m, "%-35s:%21Ld\n", - "nr_switches", (long long)(p->nvcsw + p->nivcsw)); + "nr_voluntary_switches", (long long)p->nvcsw); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_involuntary_switches", (long long)p->nivcsw); + P(se.load.weight); P(policy); P(prio); -#undef P #undef PN +#undef __PN +#undef P +#undef __P { u64 t0, t1; @@ -314,13 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; - p->sched_info.bkl_count = 0; + p->se.wait_max = 0; + p->se.sleep_max = 0; + p->se.sum_sleep_runtime = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.nr_migrations = 0; + p->se.nr_migrations_cold = 0; + p->se.nr_failed_migrations_affine = 0; + p->se.nr_failed_migrations_running = 0; + p->se.nr_failed_migrations_hot = 0; + p->se.nr_forced_migrations = 0; + p->se.nr_forced2_migrations = 0; + p->se.nr_wakeups = 0; + p->se.nr_wakeups_sync = 0; + p->se.nr_wakeups_migrate = 0; + p->se.nr_wakeups_local = 0; + p->se.nr_wakeups_remote = 0; + p->se.nr_wakeups_affine = 0; + p->se.nr_wakeups_affine_attempts = 0; + p->se.nr_wakeups_passive = 0; + p->se.nr_wakeups_idle = 0; + p->sched_info.bkl_count = 0; #endif - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->nvcsw = 0; + p->nivcsw = 0; } -- cgit v0.10.2 From 6bc1665ba71de0f207391b01b187b21b2619c15c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: allow the immediate migration of cache-cold tasks allow the immediate migration of cache-cold tasks. Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 3b27c3a..7506127 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1016,6 +1016,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + delta = now - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; @@ -2189,7 +2194,8 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. 
*/ - if (sd->nr_balance_failed > sd->cache_nice_tries) { + if (!task_hot(p, rq->clock, sd) || + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (task_hot(p, rq->clock, sd)) { schedstat_inc(sd, lb_hot_gained[idle]); -- cgit v0.10.2 From 95dbb421d12fdd9796ed153853daf3679809274f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: reintroduce topology.h tunings reintroduce the 2.6.22 topology.h tunings again - they result in slightly better balancing. Signed-off-by: Ingo Molnar diff --git a/include/linux/topology.h b/include/linux/topology.h index 525d437..865a63e 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 0, \ + .newidle_idx = 1, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,15 +128,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ @@ -159,15 +158,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ -- cgit v0.10.2 From 7a6c6bcee029a978f866511d6e41dbc7301fde4c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: enable wake-idle on CONFIG_SCHED_MC=y most multicore CPUs today have shared L2 caches, so tune things so that the spreading amongst cores is more aggressive. Signed-off-by: Ingo Molnar diff --git a/include/linux/topology.h b/include/linux/topology.h index 865a63e..47729f1 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 1, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,14 +128,15 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ + .idle_idx = 0, \ + .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ -- cgit v0.10.2 From 0dbee3a6b006dbe814d002cb18e94bf24a216451 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: Make scheduler debug file operations const In general, struct file_operations are const in the kernel, to not have false cacheline sharing and to catch bugs at compiletime with accidental writes to them. The new scheduler code introduces a new non-const one; fix this up. 
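the general pattern, in a freestanding sketch: a table of function pointers that is never written after initialisation is declared const, so it lands in read-only data and accidental stores become compile errors.

#include <stdio.h>

struct ops {
        int (*open)(const char *name);
        int (*read)(void);
};

static int my_open(const char *name) { printf("open %s\n", name); return 0; }
static int my_read(void)             { return 42; }

/* const: placed in .rodata; "my_ops.read = NULL;" would no longer compile */
static const struct ops my_ops = {
        .open = my_open,
        .read = my_read,
};

int main(void)
{
        my_ops.open("example");
        return my_ops.read() == 42 ? 0 : 1;
}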
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 27e82cb..a5e517e 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -236,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp) return single_open(filp, sched_debug_show, NULL); } -static struct file_operations sched_debug_fops = { +static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, -- cgit v0.10.2 From 5cf9f062c8e33d5a09eaa447550330162b2a96ed Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: use kcalloc() kcalloc checks for n * sizeof(element) overflows and it zeros. Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 7506127..d29950a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5243,10 +5243,9 @@ static struct ctl_table sd_ctl_root[] = { static struct ctl_table *sd_alloc_ctl_entry(int n) { struct ctl_table *entry = - kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); BUG_ON(!entry); - memset(entry, 0, n * sizeof(struct ctl_table)); return entry; } @@ -6018,7 +6017,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, + sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); -- cgit v0.10.2 From 97b6ea7b6369d51a451a7d5747a7939a593fdd9c Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: use for_each_online_cpu() init_sched_domain_sysctl was walking cpus 0-n and referencing per_cpu variables. If the cpus_possible mask is not contigious this will result in a crash referencing unallocated data. If the online mask is not contigious then we would show offline cpus and miss online ones. Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index d29950a..374f421 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5326,11 +5326,12 @@ static void init_sched_domain_sysctl(void) sd_ctl_dir[0].child = entry; - for (i = 0; i < cpu_num; i++, entry++) { + for_each_online_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; entry->child = sd_alloc_ctl_cpu_table(i); + entry++; } sd_sysctl_header = register_sysctl_table(sd_ctl_root); } -- cgit v0.10.2 From 6382bc90f5664c450afc1f896e7ddb35ba182af9 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: unregister the sysctl table before domains Unregister and free the sysctl table before destroying domains, then rebuild and register after creating the new domains. This prevents the sysctl table from pointing to freed memory for root to write. 
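the ordering matters because the registered table holds pointers into the domain data; a reduced userspace sketch of the same drop, free, rebuild, re-register sequence (all names illustrative):

#include <stdio.h>
#include <stdlib.h>

struct domain { long min_interval; };
struct view   { long *data; };          /* the registered table points into domains */

static struct domain *domains;
static struct view   *registered;

static void register_view(void)
{
        registered = malloc(sizeof(*registered));
        registered->data = &domains->min_interval;
}

static void unregister_view(void)
{
        free(registered);
        registered = NULL;
}

static void rebuild_domains(void)
{
        unregister_view();                      /* 1) drop the view first ...          */
        free(domains);                          /* 2) ... then free what it pointed at */
        domains = calloc(1, sizeof(*domains));  /* 3) build the new domains            */
        register_view();                        /* 4) re-register against the new data */
}

int main(void)
{
        domains = calloc(1, sizeof(*domains));
        register_view();
        rebuild_domains();
        printf("min_interval=%ld\n", *registered->data);
        unregister_view();
        free(domains);
        return 0;
}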
Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 374f421..a2dd054 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5250,6 +5250,18 @@ static struct ctl_table *sd_alloc_ctl_entry(int n) return entry; } +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry = *tablep; + + for (entry = *tablep; entry->procname; entry++) + if (entry->child) + sd_free_ctl_entry(&entry->child); + + kfree(*tablep); + *tablep = NULL; +} + static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, @@ -5318,7 +5330,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) } static struct ctl_table_header *sd_sysctl_header; -static void init_sched_domain_sysctl(void) +static void register_sched_domain_sysctl(void) { int i, cpu_num = num_online_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); @@ -5335,8 +5347,18 @@ static void init_sched_domain_sysctl(void) } sd_sysctl_header = register_sysctl_table(sd_ctl_root); } + +static void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} #else -static void init_sched_domain_sysctl(void) +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) { } #endif @@ -6271,6 +6293,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) err = build_sched_domains(&cpu_default_map); + register_sched_domain_sysctl(); + return err; } @@ -6287,6 +6311,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) { int i; + unregister_sched_domain_sysctl(); + for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, i); synchronize_sched(); @@ -6317,6 +6343,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) if (!err && !cpus_empty(*partition2)) err = build_sched_domains(partition2); + register_sched_domain_sysctl(); + return err; } @@ -6448,8 +6476,6 @@ void __init sched_init_smp(void) /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); - init_sched_domain_sysctl(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); -- cgit v0.10.2 From ad1cdc1d7883e88f936f7888a092e4e3e6d8c631 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: do not crash on allocation failure Now that we are calling this at runtime, a more relaxed error path is suggested. If an allocation fails, we just register the partial table, which will show empty directories. 
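a small sketch of the relaxed error path with made-up names: a failed child allocation is simply left NULL and the parent table is registered anyway, so the affected directory shows up empty instead of crashing.

#include <stdio.h>
#include <stdlib.h>

struct entry { const char *name; struct entry *child; };

static struct entry *alloc_entries(int n)
{
        return calloc(n, sizeof(struct entry)); /* may return NULL: no BUG(), no abort */
}

int main(void)
{
        struct entry *dir = alloc_entries(2);   /* one "cpu" slot plus a terminator */

        if (dir == NULL)
                return 0;                       /* nothing to register at all */

        dir[0].name  = "cpu0";
        dir[0].child = alloc_entries(12);       /* on failure the child stays NULL and
                                                 * cpu0 is just an empty directory */
        printf("registering table, cpu0 %s\n",
               dir[0].child ? "populated" : "empty");

        free(dir[0].child);
        free(dir);
        return 0;
}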
Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index a2dd054..f40fe02 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5245,8 +5245,6 @@ static struct ctl_table *sd_alloc_ctl_entry(int n) struct ctl_table *entry = kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - BUG_ON(!entry); - return entry; } @@ -5279,6 +5277,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) { struct ctl_table *table = sd_alloc_ctl_entry(12); + if (table == NULL) + return NULL; + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[1], "max_interval", &sd->max_interval, @@ -5316,6 +5317,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) for_each_domain(cpu, sd) domain_num++; entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; i = 0; for_each_domain(cpu, sd) { @@ -5336,6 +5339,9 @@ static void register_sched_domain_sysctl(void) struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; + if (entry == NULL) + return; + sd_ctl_dir[0].child = entry; for_each_online_cpu(i) { -- cgit v0.10.2 From 6323469f9b72530eb90c96ba162cc70f2f4611de Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: add terminator comment we had an incorrect-terminator bug in sd_alloc_ctl_domain_table() before, so add a comment that documents it. Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index f40fe02..9887ca00 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5303,6 +5303,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + /* &table[11] is terminator */ return table; } -- cgit v0.10.2 From 5e84cfde51cf303d368fcb48f22059f37b3872de Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: add guest-CPU /proc/stat field as recent CPUs introduce a third running state, after "user" and "system", we need a new field, "guest", in cpustat to store the time used by the CPU to run virtual CPU. Modify /proc/stat to display this new field. 
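[ editor's note: with this change the summary "cpu" line in /proc/stat
  carries nine values instead of eight.  A hypothetical sample line (the
  numbers are invented, all values are in clock ticks):

	cpu  6037 112 1642 203490 510 37 184 0 95

  read in order as user, nice, system, idle, iowait, irq, softirq, steal
  and the new guest field. ]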
Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index bee251c..b872a01 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v) int i; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest; u64 sum = 0; struct timespec boottime; unsigned int *per_irq_sum; @@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v) user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; + guest = cputime64_zero; getboottime(&boottime); jif = boottime.tv_sec; @@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v) irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); for (j = 0; j < NR_IRQS; j++) { unsigned int temp = kstat_cpu(i).irqs[j]; sum += temp; @@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v) } } - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ @@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v) irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; - seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + guest = kstat_cpu(i).cpustat.guest; + seq_printf(p, + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); } seq_printf(p, "intr %llu", (unsigned long long)sum); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 43e895f..12bf44f 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -23,6 +23,7 @@ struct cpu_usage_stat { cputime64_t idle; cputime64_t iowait; cputime64_t steal; + cputime64_t guest; }; struct kernel_stat { -- cgit v0.10.2 From 9ac52315d4cf5f561f36dabaf0720c00d3553162 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: add guest-CPU /proc//stat fields like for cpustat, introduce the "gtime" (guest time of the task) and "cgtime" (guest time of the task children) fields for the tasks. Modify signal_struct and task_struct. Modify /proc//stat to display these new fields. 
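[ editor's note: in the format string below the two new values are simply
  appended, so on a kernel carrying this patch they can be read back as
  the last two fields of the per-task stat file (/proc/<pid>/stat).  A
  hypothetical user-space sketch, not part of the patch:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[2048];
		FILE *f = fopen("/proc/self/stat", "r");

		if (!f)
			return 1;
		if (!fgets(buf, sizeof(buf), f)) {
			fclose(f);
			return 1;
		}
		fclose(f);
		buf[strcspn(buf, "\n")] = '\0';

		/* parse from the end: the comm field may itself contain spaces */
		char *cgtime = strrchr(buf, ' ');
		*cgtime++ = '\0';
		char *gtime = strrchr(buf, ' ') + 1;

		printf("gtime=%s cgtime=%s (clock ticks)\n", gtime, cgtime);
		return 0;
	}

  on kernels without this patch the trailing fields mean something else,
  so a real tool should check the field count first. ]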
Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar diff --git a/fs/proc/array.c b/fs/proc/array.c index ee4814d..27b59f5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p) } #endif +static cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + static int do_task_stat(struct task_struct *task, char *buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; @@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; + cputime_t cgtime, gtime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) sigemptyset(&sigign); sigemptyset(&sigcatch); cutime = cstime = utime = stime = cputime_zero; + cgtime = gtime = cputime_zero; rcu_read_lock(); if (lock_task_sighand(task, &flags)) { @@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) cmaj_flt = sig->cmaj_flt; cutime = sig->cutime; cstime = sig->cstime; + cgtime = sig->cgtime; rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; /* add up live thread stats at the group level */ @@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) maj_flt += t->maj_flt; utime = cputime_add(utime, task_utime(t)); stime = cputime_add(stime, task_stime(t)); + gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); @@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) maj_flt += sig->maj_flt; utime = cputime_add(utime, sig->utime); stime = cputime_add(stime, sig->stime); + gtime += cputime_add(gtime, sig->gtime); } sid = signal_session(sig); @@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) maj_flt = task->maj_flt; utime = task_utime(task); stime = task_stime(task); + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ @@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", task->pid, tcomm, state, @@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) task_cpu(task), task->rt_priority, task->policy, - (unsigned long long)delayacct_blkio_ticks(task)); + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), + cputime_to_clock_t(cgtime)); if (mm) mmput(mm); return res; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3a6e05e..fefce22 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -517,6 +517,8 @@ struct signal_struct { * in __exit_signal, except for the group leader. 
*/ cputime_t utime, stime, cutime, cstime; + cputime_t gtime; + cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; @@ -1048,6 +1050,7 @@ struct task_struct { unsigned int rt_priority; cputime_t utime, stime; + cputime_t gtime; unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ struct timespec real_start_time; /* boot based time */ diff --git a/kernel/exit.c b/kernel/exit.c index 993369e..7f7959d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk) */ sig->utime = cputime_add(sig->utime, tsk->utime); sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, cputime_add(p->stime, cputime_add(sig->stime, sig->cstime))); + psig->cgtime = + cputime_add(psig->cgtime, + cputime_add(p->gtime, + cputime_add(sig->gtime, + sig->cgtime))); psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += diff --git a/kernel/fork.c b/kernel/fork.c index 5e67f90..3fc3c13 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->tty_old_pgrp = NULL; sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->gtime = cputime_zero; + sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; @@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = cputime_zero; p->stime = cputime_zero; + p->gtime = cputime_zero; #ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ -- cgit v0.10.2 From 94886b84b1bcdc95f34f70e7fce407efefe472e1 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: maintain stats in account_system_time() modify account_system_time() to add cputime to cpustat->guest if we are running a VCPU. We add this cputime to cpustat->user instead of cpustat->system because this part of KVM code is in fact user code although it is executed in the kernel. We duplicate VCPU time between guest and user to allow an unmodified "top(1)" to display correct value. A modified "top(1)" is able to display good cpu user time and cpu guest time by subtracting cpu guest time from cpu user time. Update "gtime" in task_struct accordingly. 
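[ editor's note: because guest time is charged to both the user and guest
  counters, a consumer that wants disjoint buckets has to subtract.  For
  example, user = 1000 and guest = 400 ticks means 600 ticks of host user
  time plus 400 ticks of guest time, while an unmodified top(1) still sees
  1000 ticks of user time and keeps working.  A trivial sketch of the
  arithmetic (hypothetical helper, not part of the patch):

	static unsigned long long host_user_ticks(unsigned long long user,
						  unsigned long long guest)
	{
		/* 'user' already includes 'guest' under this scheme */
		return user - guest;
	}

]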
Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index fefce22..228e0a8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1342,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ +#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff --git a/kernel/sched.c b/kernel/sched.c index 9887ca00..5a91fe0b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3303,6 +3303,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime) } /* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + */ +void account_guest_time(struct task_struct *p, cputime_t cputime) +{ + cputime64_t tmp; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + tmp = cputime_to_cputime64(cputime); + + p->utime = cputime_add(p->utime, cputime); + p->gtime = cputime_add(p->gtime, cputime); + + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); +} + +/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3315,6 +3334,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; + if (p->flags & PF_VCPU) { + account_guest_time(p, cputime); + p->flags &= ~PF_VCPU; + return; + } + p->stime = cputime_add(p->stime, cputime); /* Add system time to cpustat. */ -- cgit v0.10.2 From d172fcd3ae1ca7ac27ec8904242fd61e0e11d332 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: maintain guest state in KVM Modify KVM to update guest time accounting. [ mingo@elte.hu: ported to 2.6.24 KVM. 
] Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index ad08138..3b0bc4b 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); +static inline void kvm_guest_enter(void) +{ + current->flags |= PF_VCPU; +} + +static inline void kvm_guest_exit(void) +{ + current->flags &= ~PF_VCPU; +} + static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 353e585..af2d288 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2046,6 +2046,7 @@ again: kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); vcpu->guest_mode = 1; + kvm_guest_enter(); if (vcpu->requests) if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) @@ -2053,6 +2054,7 @@ again: kvm_x86_ops->run(vcpu, kvm_run); + kvm_guest_exit(); vcpu->guest_mode = 0; local_irq_enable(); -- cgit v0.10.2 From 71e20f1873d46e138c26ce83f8fe54b7221f572f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: affine sync wakeups make sync wakeups affine for cache-cold tasks: if a cache-cold task is woken up by a sync wakeup then use the opportunity to migrate it straight away. (the two tasks are 'related' because they communicate) Signed-off-by: Ingo Molnar diff --git a/fs/pipe.c b/fs/pipe.c index f1fa2b4..e66ec48 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -382,7 +382,7 @@ redo: /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) @@ -555,7 +555,7 @@ redo2: out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) @@ -649,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw) if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } diff --git a/kernel/sched.c b/kernel/sched.c index 5a91fe0b..7fd3434 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1521,6 +1521,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) unsigned long tl = this_load; unsigned long tl_per_task; + /* + * Attract cache-cold tasks on sync wakeups: + */ + if (sync && !task_hot(p, rq->clock, this_sd)) + goto out_set_cpu; + schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); @@ -1598,7 +1604,7 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) 
*/ - if (!sync || cpu != this_cpu) + if (!sync || rq->curr == rq->idle) check_preempt_curr(rq, p); success = 1; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2b57eaf..6996cba 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -334,7 +334,7 @@ static void unix_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, 2, POLL_OUT); } read_unlock(&sk->sk_callback_lock); @@ -1639,7 +1639,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (!skb) goto out_unlock; - wake_up_interruptible(&u->peer_wait); + wake_up_interruptible_sync(&u->peer_wait); if (msg->msg_name) unix_copy_addr(msg, skb->sk); -- cgit v0.10.2 From 9c63d9c021f375a2708ad79043d6f4dd1291a085 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:20 +0200 Subject: sched: sync wakeups preempt too make sure sync wakeups preempt too - the scheduler will not overschedule as we've got various throttles against that. As a result, sync wakeups can be used more widely in the kernel (to signal wakeup affinity between tasks), and no arbitrary latencies will be introduced either. Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 7fd3434..bba57ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1596,16 +1596,7 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ - if (!sync || rq->curr == rq->idle) - check_preempt_curr(rq, p); + check_preempt_curr(rq, p); success = 1; out_running: -- cgit v0.10.2
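[ editor's note: the wake_up_interruptible() -> wake_up_interruptible_sync()
  conversions in fs/pipe.c and net/unix/af_unix.c above follow one pattern:
  the waker indicates it will leave the CPU in short order, so the scheduler
  may keep the wakee on this CPU and, with the affine-wakeup change, even
  migrate a cache-cold wakee here straight away.  A hypothetical
  driver-style sketch with made-up names (my_dev, enqueue_item(),
  have_space(), space_wq), not code from the patches:

	#include <linux/wait.h>
	#include <linux/sched.h>

	static DECLARE_WAIT_QUEUE_HEAD(consumer_wq);

	static int producer_push(struct my_dev *dev)
	{
		enqueue_item(dev);

		/* we sleep right below, so a sync (affine) wakeup is cheap */
		wake_up_interruptible_sync(&consumer_wq);

		return wait_event_interruptible(dev->space_wq, have_space(dev));
	}

]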