From 743cb1ff191f00fee653212bdbcee1e56086d6ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Jul 2014 17:00:21 +0200 Subject: sched/fair: Make calculate_imbalance() independent Rik noticed that calculate_imbalance() relies on update_sd_pick_busiest() to guarantee that busiest->sum_nr_running > busiest->group_capacity_factor. Break this implicit assumption (with the intent of not providing it anymore) by having calculat_imbalance() verify it and not rely on others. Reported-by: Rik van Riel Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20140729152631.GW12054@laptop.lan Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86..e9477e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6248,7 +6248,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (!busiest->group_imb) { + if (busiest->sum_nr_running > busiest->group_capacity_factor) { /* * Don't want to pull so many tasks that a group would go idle. * Except of course for the group_imb case, since then we might -- cgit v0.10.2 From caeb178c60f4f93f1b45c0bc056b5cf6d217b67f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 28 Jul 2014 14:16:28 -0400 Subject: sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd Currently update_sd_pick_busiest only identifies the busiest sd that is either overloaded, or has a group imbalance. When no sd is imbalanced or overloaded, the load balancer fails to find the busiest domain. This breaks load balancing between domains that are not overloaded, in the !SD_ASYM_PACKING case. This patch makes update_sd_pick_busiest return true when the busiest sd yet is encountered. Groups are ranked in the order overloaded > imbalanced > other, with higher ranked groups getting priority even when their load is lower. This is necessary due to the possibility of unequal capacities and cpumasks between domains within a sched group. Behaviour for SD_ASYM_PACKING does not seem to match the comment, but I have no hardware to test that so I have left the behaviour of that code unchanged. Enum for group classification suggested by Peter Zijlstra. Signed-off-by: Rik van Riel [peterz: replaced sg_lb_stats::group_imb with the new enum group_type in an attempt to avoid endless recalculation] Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Acked-by: Michael Neuling Cc: ktkhai@parallels.com Cc: tim.c.chen@linux.intel.com Cc: nicolas.pitre@linaro.org Cc: jhladky@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140729152743.GI3935@laptop Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9477e6..9437725 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5559,6 +5559,13 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ + +enum group_type { + group_other = 0, + group_imbalanced, + group_overloaded, +}; + /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -5572,7 +5579,7 @@ struct sg_lb_stats { unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; - int group_imb; /* Is there an imbalance in the group ? */ + enum group_type group_type; int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -5610,6 +5617,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, + .sum_nr_running = 0, + .group_type = group_other, }, }; } @@ -5891,6 +5900,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro return capacity_factor; } +static enum group_type +group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->group_capacity_factor) + return group_overloaded; + + if (sg_imbalanced(group)) + return group_imbalanced; + + return group_other; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -5942,9 +5963,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - - sgs->group_imb = sg_imbalanced(group); sgs->group_capacity_factor = sg_capacity_factor(env, group); + sgs->group_type = group_classify(group, sgs); if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; @@ -5968,13 +5988,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->busiest_stat.avg_load) - return false; + struct sg_lb_stats *busiest = &sds->busiest_stat; - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_type > busiest->group_type) return true; - if (sgs->group_imb) + if (sgs->group_type < busiest->group_type) + return false; + + if (sgs->avg_load <= busiest->avg_load) + return false; + + /* This is the busiest node in its class. */ + if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* @@ -5982,8 +6008,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ - if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -6228,7 +6253,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_imb) { + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX @@ -6248,7 +6273,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (busiest->sum_nr_running > busiest->group_capacity_factor) { + if (busiest->group_type == group_overloaded) { /* * Don't want to pull so many tasks that a group would go idle. * Except of course for the group_imb case, since then we might @@ -6337,7 +6362,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (busiest->group_imb) + if (busiest->group_type == group_imbalanced) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ -- cgit v0.10.2 From 9a5d9ba6a3631d55c358fe1bdbaa162a97471a05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Jul 2014 17:15:11 +0200 Subject: sched/fair: Allow calculate_imbalance() to move idle cpus Allow calculate_imbalance() to 'create' idle cpus in the busiest group if there are idle cpus in the local group. Suggested-by: Rik van Riel Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140729152705.GX12054@laptop.lan Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9437725..df1ed17 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6273,12 +6273,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (busiest->group_type == group_overloaded) { - /* - * Don't want to pull so many tasks that a group would go idle. - * Except of course for the group_imb case, since then we might - * have to drop below capacity to reach cpu-load equilibrium. - */ + /* + * If there aren't any idle cpus, avoid creating some. + */ + if (busiest->group_type == group_overloaded && + local->group_type == group_overloaded) { load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); -- cgit v0.10.2 From aaecac4ad46b35ad308245384d019633fb9bc21b Mon Sep 17 00:00:00 2001 From: Zhihui Zhang Date: Fri, 1 Aug 2014 21:18:03 -0400 Subject: sched: Rename a misleading variable in build_overlap_sched_groups() The child variable in build_overlap_sched_groups() actually refers to the peer or sibling domain of the given CPU. Rename it to sibling to be consistent with the naming in build_group_mask(). Signed-off-by: Zhihui Zhang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1406942283-18249-1-git-send-email-zzhsuny@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575..7d1ec6e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5739,7 +5739,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; - struct sched_domain *child; + struct sched_domain *sibling; int i; cpumask_clear(covered); @@ -5750,10 +5750,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; - child = *per_cpu_ptr(sdd->sd, i); + sibling = *per_cpu_ptr(sdd->sd, i); /* See the comment near build_group_mask(). */ - if (!cpumask_test_cpu(i, sched_domain_span(child))) + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), @@ -5763,10 +5763,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - if (child->child) { - child = child->child; - cpumask_copy(sg_span, sched_domain_span(child)); - } else + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else cpumask_set_cpu(i, sg_span); cpumask_or(covered, covered, sg_span); -- cgit v0.10.2 From b932c03c34f3b03c7364c06aa8cae5b74609fc41 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 4 Aug 2014 13:23:27 -0400 Subject: sched/numa: Fix off-by-one in capacity check Commit a43455a1d572daf7b730fe12eb747d1e17411365 ensures that task_numa_migrate will call task_numa_compare on the preferred node all the time, even when the preferred node has no free capacity. This could lead to a performance regression if nr_running == capacity on both the source and the destination node. This can be avoided by also checking for nr_running == capacity on the source node, which is one stricter than checking .has_free_capacity. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: vincent.guittot@linaro.org Cc: Morten.Rasmussen@arm.com Cc: nicolas.pitre@linaro.org Cc: efault@gmx.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407173008-9334-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df1ed17..e1cf419 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1206,7 +1206,7 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_free_capacity && + if (env->src_stats.nr_running <= env->src_stats.task_capacity && !env->dst_stats.has_free_capacity) goto unlock; -- cgit v0.10.2 From 83d7f2424741c9dc76c21377c9d00d47abaf88df Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 4 Aug 2014 13:23:28 -0400 Subject: sched/numa: Fix numa capacity computation Commit c61037e9 fixes the phenomenon of 'fantom' cores due to N*frac(smt_power) >= 1 by limiting the capacity to the actual number of cores in the load balancing code. This patch applies the same correction to the NUMA balancing code. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: vincent.guittot@linaro.org Cc: Morten.Rasmussen@arm.com Cc: nicolas.pitre@linaro.org Cc: efault@gmx.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407173008-9334-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e1cf419..1413c44 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1038,7 +1038,8 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu, cpus = 0; + int smt, cpu, cpus = 0; + unsigned long capacity; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1062,8 +1063,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); + capacity = cpus / smt; /* cores */ + + ns->task_capacity = min_t(unsigned, capacity, + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } -- cgit v0.10.2 From 5d07f4202c5d63b73ba1734ed38e08461a689313 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:19:53 +0200 Subject: sched: s/do_each_thread/for_each_process_thread/ in core.c Change kernel/sched/core.c to use for_each_process_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813191953.GA19315@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d1ec6e..4f2826f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4505,7 +4505,7 @@ void show_state_filter(unsigned long state_filter) " task PC stack pid father\n"); #endif rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: @@ -4513,7 +4513,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } touch_all_softlockup_watchdogs(); @@ -7137,7 +7137,7 @@ void normalize_rt_tasks(void) struct rq *rq; read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * Only normalize user tasks: */ @@ -7168,8 +7168,7 @@ void normalize_rt_tasks(void) __task_rq_unlock(rq); raw_spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - + } read_unlock_irqrestore(&tasklist_lock, flags); } @@ -7357,10 +7356,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (rt_task(p) && task_rq(p)->rt.tg == tg) return 1; - } while_each_thread(g, p); + } return 0; } -- cgit v0.10.2 From d38e83c715270cc2e137bbf6f25206c8c023896b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:19:56 +0200 Subject: sched: s/do_each_thread/for_each_process_thread/ in debug.c Change kernel/sched/debug.c to use for_each_process_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813191956.GA19324@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c3..c7fe1ea0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -160,14 +160,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "----------------------------------------------------\n"); read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); - + } read_unlock_irqrestore(&tasklist_lock, flags); } -- cgit v0.10.2 From 1e4dda08b4c39b3d8f4a3ee7269d49e0200c8af8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:20:00 +0200 Subject: sched: Change thread_group_cputime() to use for_each_thread() Change thread_group_cputime() to use for_each_thread() instead of buggy while_each_thread(). This also makes the pid_alive() check unnecessary. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813192000.GA19327@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06..3e52836 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -294,18 +294,12 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { + for_each_thread(tsk, t) { task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: + } rcu_read_unlock(); } -- cgit v0.10.2 From 5aface53d1a0ef7823215c4078fca8445995d006 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:20:03 +0200 Subject: sched: Change autogroup_move_group() to use for_each_thread() Change autogroup_move_group() to use for_each_thread() instead of buggy while_each_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813192003.GA19334@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba..8a2e230 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) goto out; - t = p; - do { + for_each_thread(p, t) sched_move_task(t); - } while_each_thread(p, t); - out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); -- cgit v0.10.2 From 8b06c55bdb8b402cb4814e83dc4b1cb245fcc9f5 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 13 Aug 2014 13:28:12 -0400 Subject: sched: Match declaration with definition Match the declaration of runqueues with the definition. Signed-off-by: Pranith Kumar Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407950893-32731-1-git-send-email-bobby.prani@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f..4c2b87f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -647,7 +647,7 @@ static inline int cpu_of(struct rq *rq) #endif } -DECLARE_PER_CPU(struct rq, runqueues); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) -- cgit v0.10.2 From f36c019c79edb3a89920afae1b2b45987af1a112 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 6 Aug 2014 12:06:01 +0400 Subject: sched/fair: Fix reschedule which is generated on throttled cfs_rq (sched_entity::on_rq == 1) does not guarantee the task is pickable; changes on throttled cfs_rq must not lead to reschedule. Check for task_struct::on_rq instead. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407312361.8424.35.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1413c44..bc37bb9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7494,7 +7494,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!p->on_rq) return; /* @@ -7550,15 +7550,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *se = &p->se; /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!se->on_rq) + if (!p->on_rq) return; /* -- cgit v0.10.2 From da0c1e65b51a289540159663aa4b90ba2366bc21 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:47:32 +0400 Subject: sched: Add wrapper for checking task_struct::on_rq Implement task_on_rq_queued() and use it everywhere instead of on_rq check. No functional changes. The only exception is we do not use the wrapper in check_for_tasks(), because it requires to export task_on_rq_queued() in global header files. Next patch in series would return it back, so we do not twist it from here to there. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528052.23412.87.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f2826f..a02b624 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1043,7 +1043,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -1088,7 +1088,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) static void __migrate_swap_task(struct task_struct *p, int cpu) { - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; src_rq = task_rq(p); @@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data); unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; - int running, on_rq; + int running, queued; unsigned long ncsw; struct rq *rq; @@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -1284,7 +1284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1478,7 +1478,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -1537,7 +1537,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) int ret = 0; rq = __task_rq_lock(p); - if (p->on_rq) { + if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); @@ -1678,7 +1678,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (task_on_rq_queued(p) && ttwu_remote(p, wake_flags)) goto stat; #ifdef CONFIG_SMP @@ -1742,7 +1742,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; - if (!p->on_rq) + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); @@ -2095,7 +2095,7 @@ void wake_up_new_task(struct task_struct *p) init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2444,7 +2444,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (task_current(rq, p) && p->on_rq) { + if (task_current(rq, p) && task_on_rq_queued(p)) { update_rq_clock(rq); ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) @@ -2490,7 +2490,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. */ - if (!p->on_cpu || !p->on_rq) + if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; #endif @@ -2794,7 +2794,7 @@ need_resched: switch_count = &prev->nvcsw; } - if (prev->on_rq || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) update_rq_clock(rq); next = pick_next_task(rq, prev); @@ -2959,7 +2959,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; @@ -2988,9 +2988,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -3030,7 +3030,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); @@ -3041,7 +3041,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + int old_prio, delta, queued; unsigned long flags; struct rq *rq; @@ -3062,8 +3062,8 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); p->static_prio = NICE_TO_PRIO(nice); @@ -3072,7 +3072,7 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and @@ -3344,7 +3344,7 @@ static int __sched_setscheduler(struct task_struct *p, { int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, oldprio, oldpolicy = -1, queued, running; int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; @@ -3541,9 +3541,9 @@ change: return 0; } - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -3553,7 +3553,7 @@ change: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). @@ -4568,7 +4568,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_rq = 1; + idle->on_rq = TASK_ON_RQ_QUEUED; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4645,7 +4645,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); @@ -4695,7 +4695,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->on_rq) { + if (task_on_rq_queued(p)) { dequeue_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); enqueue_task(rq_dest, p, 0); @@ -4736,13 +4736,13 @@ void sched_setnuma(struct task_struct *p, int nid) { struct rq *rq; unsigned long flags; - bool on_rq, running; + bool queued, running; rq = task_rq_lock(p, &flags); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -4751,7 +4751,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } @@ -7116,13 +7116,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) .sched_policy = SCHED_NORMAL, }; int old_prio = p->prio; - int on_rq; + int queued; - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); __setscheduler(rq, p, &attr); - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); resched_curr(rq); } @@ -7309,16 +7309,16 @@ void sched_offline_group(struct task_group *tg) void sched_move_task(struct task_struct *tsk) { struct task_group *tg; - int on_rq, running; + int queued, running; unsigned long flags; struct rq *rq; rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->on_rq; + queued = task_on_rq_queued(tsk); - if (on_rq) + if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); @@ -7331,14 +7331,14 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, on_rq); + tsk->sched_class->task_move_group(tsk, queued); else #endif set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, tsk, 0); task_rq_unlock(rq, tsk, &flags); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce13..d21a8e0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -530,7 +530,7 @@ again: update_rq_clock(rq); dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; - if (p->on_rq) { + if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); @@ -1030,7 +1030,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * means a stop task can slip in, in which case we need to * re-start task selection. */ - if (rq->stop && rq->stop->on_rq) + if (rq->stop && task_on_rq_queued(rq->stop)) return RETRY_TASK; } @@ -1257,7 +1257,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || - task_running(rq, task) || !task->on_rq)) { + task_running(rq, task) || + !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1296,7 +1297,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); return p; @@ -1443,7 +1444,7 @@ static int pull_dl_task(struct rq *this_rq) dl_time_before(p->dl.deadline, this_rq->dl.earliest_dl.curr))) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * Then we pull iff p has actually an earlier @@ -1596,7 +1597,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ @@ -1614,7 +1615,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (p->on_rq || rq->curr == p) { + if (task_on_rq_queued(p) || rq->curr == p) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc37bb9..9e6ca0d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7494,7 +7494,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7519,11 +7519,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it's on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !on_rq, then only when + * If it's queued, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !queued, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!p->on_rq && p->state != TASK_RUNNING) { + if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7558,7 +7558,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7604,7 +7604,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int queued) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; @@ -7623,7 +7623,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * fair sleeper stuff for the first placement, but who cares. */ /* - * When !on_rq, vruntime of the task has usually NOT been normalized. + * When !queued, vruntime of the task has usually NOT been normalized. * But there are some cases where it has already been normalized: * * - Moving a forked child which is waiting for being woken up by @@ -7634,14 +7634,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; + if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) + queued = 1; - if (!on_rq) + if (!queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!on_rq) { + if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca..4feac8f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * means a dl or stop task can slip in, in which case we need * to re-start task selection. */ - if (unlikely((rq->stop && rq->stop->on_rq) || + if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || rq->dl.dl_nr_running)) return RETRY_TASK; } @@ -1624,7 +1624,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || - !task->on_rq)) { + !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1658,7 +1658,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); return p; @@ -1809,7 +1809,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * There's a chance that p is higher in priority @@ -1870,7 +1870,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, BUG_ON(!rt_task(p)); - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; weight = cpumask_weight(new_mask); @@ -1936,7 +1936,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!p->on_rq || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; if (pull_rt_task(rq)) @@ -1970,7 +1970,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ @@ -1989,7 +1989,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; if (rq->curr == p) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c2b87f..26566d0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -15,6 +15,9 @@ struct rq; +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 + extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; @@ -942,6 +945,10 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #endif } +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0eda..67426e5 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (!stop || !stop->on_rq) + if (!stop || !task_on_rq_queued(stop)) return NULL; put_prev_task(rq, prev); -- cgit v0.10.2 From cca26e8009d1939a6a5bf0200d276fa26f03e536 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:47:42 +0400 Subject: sched: Teach scheduler to understand TASK_ON_RQ_MIGRATING state This is a new p->on_rq state which will be used to indicate that a task is in a process of migrating between two RQs. It allows to get rid of double_rq_lock(), which we used to use to change a rq of a queued task before. Let's consider an example. To move a task between src_rq and dst_rq we will do the following: raw_spin_lock(&src_rq->lock); /* p is a task which is queued on src_rq */ p = ...; dequeue_task(src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, dst_cpu); raw_spin_unlock(&src_rq->lock); /* * Both RQs are unlocked here. * Task p is dequeued from src_rq * but its on_rq value is not zero. */ raw_spin_lock(&dst_rq->lock); p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(dst_rq, p, 0); raw_spin_unlock(&dst_rq->lock); While p->on_rq is TASK_ON_RQ_MIGRATING, task is considered as "migrating", and other parallel scheduler actions with it are not available to parallel callers. The parallel caller is spining till migration is completed. The unavailable actions are changing of cpu affinity, changing of priority etc, in other words all the functionality which used to require task_rq(p)->lock before (and related to the task). To implement TASK_ON_RQ_MIGRATING support we primarily are using the following fact. Most of scheduler users (from which we are protecting a migrating task) use task_rq_lock() and __task_rq_lock() to get the lock of task_rq(p). These primitives know that task's cpu may change, and they are spining while the lock of the right RQ is not held. We add one more condition into them, so they will be also spinning until the migration is finished. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528062.23412.88.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a02b624..71b8360 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -333,9 +333,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -352,10 +355,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -1678,7 +1684,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - if (task_on_rq_queued(p) && ttwu_remote(p, wake_flags)) + if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 26566d0..aa0f73b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -17,6 +17,7 @@ struct rq; /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 extern __read_mostly int scheduler_running; @@ -950,6 +951,11 @@ static inline int task_on_rq_queued(struct task_struct *p) return p->on_rq == TASK_ON_RQ_QUEUED; } +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v0.10.2 From a1e01829796aa7a993e28ffd7fee5c8d525be175 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:47:50 +0400 Subject: sched: Remove double_rq_lock() from __migrate_task() Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for __migrate_task(). The advantage is (obviously) not holding two rq->lock's at the same time and thereby increasing parallelism. The important point to note is that because we acquire dst->lock immediately after releasing src->lock the potential wait time of task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double rq lock scenario. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528070.23412.89.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 71b8360..a773c91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4679,20 +4679,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - struct rq *rq_dest, *rq_src; + struct rq *rq; int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); + rq = cpu_rq(src_cpu); raw_spin_lock(&p->pi_lock); - double_rq_lock(rq_src, rq_dest); + raw_spin_lock(&rq->lock); /* Already moved. */ if (task_cpu(p) != src_cpu) goto done; + /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; @@ -4702,15 +4702,22 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * placed properly. */ if (task_on_rq_queued(p)) { - dequeue_task(rq_src, p, 0); + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, dest_cpu); - enqueue_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(dest_cpu); + raw_spin_lock(&rq->lock); + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); } done: ret = 1; fail: - double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); return ret; } -- cgit v0.10.2 From e5673f280501298dbb56efa46e333cf64ee5080a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:48:01 +0400 Subject: sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Avoid double_rq_lock() and use the TASK_ON_RQ_MIGRATING state for active_load_balance_cpu_stop(). The advantage is (obviously) not holding two 'rq->lock's at the same time and thereby increasing parallelism. Further note that if there was no task to migrate we will not have acquired the second rq->lock at all. The important point to note is that because we acquire dst->lock immediately after releasing src->lock the potential wait time of task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double rq lock scenario. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528081.23412.92.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e6ca0d..7e5cf05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5138,6 +5138,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; + lockdep_assert_held(&env->src_rq->lock); + if (p->sched_class != &fair_sched_class) return 0; @@ -5257,6 +5259,9 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; + + lockdep_assert_held(&env->src_rq->lock); + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -5341,30 +5346,49 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) } /* - * move_one_task tries to move exactly one task from busiest to this_rq, as + * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. * - * Called with both runqueues locked. + * Returns a task if successful and NULL otherwise. */ -static int move_one_task(struct lb_env *env) +static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p, *n; + lockdep_assert_held(&env->src_rq->lock); + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; - move_task(p, env); + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); + /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). + * Right now, this is only the second place where + * lb_gained[env->idle] is updated (other is move_tasks) + * so we can safely collect stats here rather than + * inside move_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + return p; } - return 0; + return NULL; +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + raw_spin_unlock(&rq->lock); } static const unsigned int sched_nr_migrate_break = 32; @@ -6943,6 +6967,7 @@ static int active_load_balance_cpu_stop(void *data) int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); @@ -6962,9 +6987,6 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -6985,16 +7007,22 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + p = detach_one_task(&env); + if (p) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); + raw_spin_unlock(&busiest_rq->lock); + + if (p) + attach_one_task(target_rq, p); + + local_irq_enable(); + return 0; } -- cgit v0.10.2 From 163122b7fcfa28c0e4a838fcc8043c616746802e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:48:29 +0400 Subject: sched/fair: Remove double_lock_balance() from load_balance() Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for load_balance(). The advantage is (obviously) not holding two rq->lock's at the same time and thereby increasing parallelism. Further note that if there was no task to migrate we will not have acquired the second rq->lock at all. The important point to note is that because we acquire dst->lock immediately after releasing src->lock the potential wait time of task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double rq lock scenario. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e5cf05..d3427a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible from callers such as attach_tasks(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -5117,21 +5117,10 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + struct list_head tasks; }; /* - * move_task - move a task from one runqueue to another runqueue. - * Both runqueues must be locked. - */ -static void move_task(struct task_struct *p, struct lb_env *env) -{ - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); -} - -/* * Is this task likely cache-hot: */ static int task_hot(struct task_struct *p, struct lb_env *env) @@ -5346,6 +5335,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) } /* + * detach_task() -- detach the task for the migration specified in env + */ +static void detach_task(struct task_struct *p, struct lb_env *env) +{ + lockdep_assert_held(&env->src_rq->lock); + + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); +} + +/* * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". * @@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env) if (!can_migrate_task(p, env)) continue; - deactivate_task(env->src_rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, env->dst_cpu); + detach_task(p, env); /* * Right now, this is only the second place where - * lb_gained[env->idle] is updated (other is move_tasks) + * lb_gained[env->idle] is updated (other is detach_tasks) * so we can safely collect stats here rather than - * inside move_tasks(). + * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); return p; @@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -/* - * attach_one_task() -- attaches the task returned from detach_one_task() to - * its new rq. - */ -static void attach_one_task(struct rq *rq, struct task_struct *p) -{ - raw_spin_lock(&rq->lock); - BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; - activate_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - raw_spin_unlock(&rq->lock); -} - static const unsigned int sched_nr_migrate_break = 32; /* - * move_tasks tries to move up to imbalance weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. + * detach_tasks() -- tries to detach up to imbalance weighted load from + * busiest_rq, as part of a balancing operation within domain "sd". * - * Called with both runqueues locked. + * Returns number of detached tasks if successful and 0 otherwise. */ -static int move_tasks(struct lb_env *env) +static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; - int pulled = 0; + int detached = 0; + + lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; @@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - move_task(p, env); - pulled++; + detach_task(p, env); + list_add(&p->se.group_node, &env->tasks); + + detached++; env->imbalance -= load; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize + * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) @@ -5463,13 +5451,58 @@ next: } /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places we collect this stat + * so we can safely collect detach_one_task() stats here rather + * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], detached); - return pulled; + return detached; +} + +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_held(&rq->lock); + + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + attach_task(rq, p); + raw_spin_unlock(&rq->lock); +} + +/* + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their + * new rq. + */ +static void attach_tasks(struct lb_env *env) +{ + struct list_head *tasks = &env->tasks; + struct task_struct *p; + + raw_spin_lock(&env->dst_rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + attach_task(env->dst_rq, p); + } + + raw_spin_unlock(&env->dst_rq->lock); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), }; /* @@ -6652,16 +6686,29 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - local_irq_save(flags); - double_rq_lock(env.dst_rq, busiest); + raw_spin_lock_irqsave(&busiest->lock, flags); /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ - cur_ld_moved = move_tasks(&env); - ld_moved += cur_ld_moved; - double_rq_unlock(env.dst_rq, busiest); + cur_ld_moved = detach_tasks(&env); + + /* + * We've detached some tasks from busiest_rq. Every + * task is masked "TASK_ON_RQ_MIGRATING", so we can safely + * unlock busiest->lock, and we are able to be sure + * that nobody can manipulate the tasks in parallel. + * See task_rq_lock() family for the details. + */ + + raw_spin_unlock(&busiest->lock); + + if (cur_ld_moved) { + attach_tasks(&env); + ld_moved += cur_ld_moved; + } + local_irq_restore(flags); /* @@ -6797,7 +6844,7 @@ more_balance: * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call - * move_tasks). + * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; -- cgit v0.10.2 From 35b123e2f701b28977db2cde7dbbdb3fad28cad1 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Fri, 22 Aug 2014 17:50:43 +0300 Subject: sched/fair: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Peter Zijlstra (Intel) Cc: paulmck@linux.vnet.ibm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140822145043.GA580@ada Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3427a8..02fc949 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1780,7 +1780,7 @@ void task_numa_free(struct task_struct *p) list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); - rcu_assign_pointer(p->numa_group, NULL); + RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } -- cgit v0.10.2 From 6b44f519017b219a12b37173c7eef8dfce2c0100 Mon Sep 17 00:00:00 2001 From: Scot Doyle Date: Sun, 24 Aug 2014 17:12:27 +0000 Subject: sched/wait: Document timeout corner case The timeout may elapse without 0 being returned, such as when waiting on an unused queue. Document this possibility. Signed-off-by: Scot Doyle Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LNX.2.11.1408241710070.6462@localhost.localdomain Signed-off-by: Ingo Molnar diff --git a/include/linux/wait.h b/include/linux/wait.h index 6fb1ba5..034f6fc 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -280,9 +280,11 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * - * The function returns 0 if the @timeout elapsed, or the remaining - * jiffies (at least 1) if the @condition evaluated to %true before - * the @timeout elapsed. + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq, condition, timeout) \ ({ \ @@ -363,9 +365,11 @@ do { \ * change the result of the wait condition. * * Returns: - * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by - * a signal, or the remaining jiffies (at least 1) if the @condition - * evaluated to %true before the @timeout elapsed. + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ -- cgit v0.10.2 From 177ef2a6315ea7bf173653182324e1dcd08ffeaa Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Tue, 26 Aug 2014 03:15:41 +0000 Subject: sched/deadline: Fix a precision problem in the microseconds range An overrun could happen in function start_hrtick_dl() when a task with SCHED_DEADLINE runs in the microseconds range. For example, if a task with SCHED_DEADLINE has the following parameters: Task runtime deadline period P1 200us 500us 500us The deadline and period from task P1 are less than 1ms. In order to achieve microsecond precision, we need to enable HRTICK feature by the next command: PC#echo "HRTICK" > /sys/kernel/debug/sched_features PC#trace-cmd record -e sched_switch & PC#./schedtool -E -t 200000:500000:500000 -e ./test The binary test is in an endless while(1) loop here. Some pieces of trace.dat are as follows: -0 157.603157: sched_switch: :R ==> 2481:4294967295: test test-2481 157.603203: sched_switch: 2481:R ==> 0:120: swapper/2 -0 157.605657: sched_switch: :R ==> 2481:4294967295: test test-2481 157.608183: sched_switch: 2481:R ==> 2483:120: trace-cmd trace-cmd-2483 157.609656: sched_switch:2483:R==>2481:4294967295: test We can get the runtime of P1 from the information above: runtime = 157.608183 - 157.605657 runtime = 0.002526(2.526ms) The correct runtime should be less than or equal to 200us at some point. The problem is caused by a conditional judgment "delta > 10000" in function start_hrtick_dl(). Because no hrtimer start up to control the rest of runtime when the reset of runtime is less than 10us. So the process will continue to run until tick-period is coming. Move the code with the limit of the least time slice from hrtick_start_fair() to hrtick_start() because the EDF schedule class also needs this function in start_hrtick_dl(). To fix this problem, we call hrtimer_start() unconditionally in start_hrtick_dl(), and make sure the scheduling slice won't be smaller than 10us in hrtimer_start(). Signed-off-by: Xiaofeng Yan Reviewed-by: Li Zefan Acked-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409022941-5880-1-git-send-email-xiaofeng.yan@huawei.com [ Massaged the changelog and the code. ] Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a773c91..8d00f4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -455,7 +455,15 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + ktime_t time; + s64 delta; + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense and can cause timer DoS. + */ + delta = max_t(s64, delay, 10000LL); + time = ktime_add_ns(timer->base->get_time(), delta); hrtimer_set_expires(timer, time); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d21a8e0..cc4eb89 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SCHED_HRTICK static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { - s64 delta = p->dl.dl_runtime - p->dl.runtime; - - if (delta > 10000) - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, p->dl.runtime); } #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02fc949..50d2025 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3897,14 +3897,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) resched_curr(rq); return; } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - hrtick_start(rq, delta); } } -- cgit v0.10.2 From 90ed9cbe765ad358b3151a12b8bf889a3cbcd573 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 15 Aug 2014 16:05:36 -0400 Subject: exit: Always reap resource stats in __exit_signal() Oleg pointed out that wait_task_zombie adds a task's usage statistics to the parent's signal struct, but the task's own signal struct should also propagate the statistics at exit time. This allows thread_group_cputime(reaped_zombie) to get the statistics after __unhash_process() has made the task invisible to for_each_thread, but before the thread has actually been rcu freed, making sure no non-monotonic results are returned inside that window. Suggested-by: Oleg Nesterov Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: David Rientjes Cc: Guillaume Morin Cc: Ionut Alexa Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7..b93d46d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -115,30 +115,29 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads but the group leader + * as they die, so they can be added into the process-wide totals + * when those are taken. The group leader stays around as a zombie as + * long as there are other threads. When it gets reaped, the exit.c + * code will add its counts into these totals. We won't ever get here + * for the group leader, since it will have been the last reference on + * the signal_struct. + */ + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); -- cgit v0.10.2 From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..dd9eb48 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -645,6 +645,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; diff --git a/kernel/exit.c b/kernel/exit.c index b93d46d..fa09b86 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) * the signal_struct. */ task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb..9387ae8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836..49b7cfe 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; rcu_read_lock(); - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + read_seqbegin_or_lock(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry(&sig->stats_lock, seq); rcu_read_unlock(); } @@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. - */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index ce81291..b663664 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b89464..492b986 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v0.10.2 From eb1b4af0a64ac7bb0ee36f579c1c7cefcbc3ac2c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 15 Aug 2014 16:05:38 -0400 Subject: sched, time: Atomically increment stime & utime The functions task_cputime_adjusted and thread_group_cputime_adjusted() can be called locklessly, as well as concurrently on many different CPUs. This can occasionally lead to the utime and stime reported by times(), and other syscalls like it, going backward. The cause for this appears to be multiple threads racing in cputime_adjust(), both with values for utime or stime that is larger than the original, but each with a different value. Sometimes the larger value gets saved first, only to be immediately overwritten with a smaller value by another thread. Using atomic exchange prevents that problem, and ensures time progresses monotonically. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: akpm@linux-foundation.org Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 49b7cfe..2b57031 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -602,9 +602,12 @@ static void cputime_adjust(struct task_cputime *curr, * If the tick based count grows faster than the scheduler one, * the result of the scaling may go backward. * Let's enforce monotonicity. + * Atomic exchange protects against concurrent cputime_adjust(). */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); + while (stime > (rtime = ACCESS_ONCE(prev->stime))) + cmpxchg(&prev->stime, rtime, stime); + while (utime > (rtime = ACCESS_ONCE(prev->utime))) + cmpxchg(&prev->utime, rtime, utime); out: *ut = prev->utime; -- cgit v0.10.2 From 5cd038f53ed9ec7a17ab7d536a727363080f4210 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 4 Jun 2014 16:25:15 +0800 Subject: sched: Migrate waking tasks Current code can fail to migrate a waking task (silently) when TTWU_QUEUE is enabled. When a task is waking, it is pending on the wake_list of the rq, but it is not queued (task->on_rq == 0). In this case, set_cpus_allowed_ptr() and __migrate_task() will not migrate it because its invisible to them. This behavior is incorrect, because the task has been already woken, it will be running on the wrong CPU without correct placement until the next wake-up or update for cpus_allowed. To fix this problem, we need to finish the wakeup (so they appear on the runqueue) before we migrate them. Reported-by: Sasha Levin Reported-by: Jason J. Herne Tested-by: Jason J. Herne Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/538ED7EB.5050303@cn.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a814b3c..78e5c83 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4666,7 +4666,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_on_rq_queued(p)) { + if (task_on_rq_queued(p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); @@ -4799,6 +4799,12 @@ static int migration_cpu_stop(void *data) * be on another cpu but it doesn't matter. */ local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); return 0; -- cgit v0.10.2 From 8236d907ab3411ad452280faa8b26c1347327380 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 2 Sep 2014 00:41:24 -0700 Subject: sched: Reduce contention in update_cfs_rq_blocked_load() When running workloads on 2+ socket systems, based on perf profiles, the update_cfs_rq_blocked_load() function often shows up as taking up a noticeable % of run time. Much of the contention is in __update_cfs_rq_tg_load_contrib() when we update the tg load contribution stats. However, it turns out that in many cases, they don't need to be updated and "tg_contrib" is 0. This patch adds a check in __update_cfs_rq_tg_load_contrib() to skip updating tg load contribution stats when nothing needs to be updated. This reduces the cacheline contention that would be unnecessary. Reviewed-by: Ben Segall Reviewed-by: Waiman Long Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Paul Turner Cc: jason.low2@hp.com Cc: Yuyang Du Cc: Aswin Chandramouleeswaran Cc: Chegu Vinod Cc: Scott J Norton Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409643684.19197.15.camel@j-VirtualBox Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50d2025..be9e97b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2382,6 +2382,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; + if (!tg_contrib) + return; + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; -- cgit v0.10.2 From ad67dc316f000df4756b027f3559ad0491497d9e Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Tue, 9 Sep 2014 10:57:12 +0100 Subject: Documentation/scheduler/sched-deadline.txt: Fix terminology and improve clarity Several small changes regarding SCHED_DEADLINE documentation that fix terminology and improve clarity and readability: - "current runtime" becomes "remaining runtime" - readablity of an equation is improved by introducing more spacing - clarify when admission control will certainly fail - new URL for CBS technical report - substitue "smallest" with "earliest" Signed-off-by: Luca Abeni Signed-off-by: Juri Lelli Reviewed-by: Henrik Austad Cc: Randy Dunlap Cc: Peter Zijlstra Cc: Dario Faggioli Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410256636-26171-2-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 18adc92..a029891 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -45,17 +45,17 @@ CONTENTS every time the task wakes up, the scheduler computes a "scheduling deadline" consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then scheduled using EDF[1] on these scheduling deadlines (the task with the - smallest scheduling deadline is selected for execution). Notice that this + earliest scheduling deadline is selected for execution). Notice that this guaranteed is respected if a proper "admission control" strategy (see Section "4. Bandwidth management") is used. Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so that each task runs for at most its runtime every period, avoiding any interference between different tasks (bandwidth isolation), while the EDF[1] - algorithm selects the task with the smallest scheduling deadline as the one - to be executed first. Thanks to this feature, also tasks that do not - strictly comply with the "traditional" real-time task model (see Section 3) - can effectively use the new policy. + algorithm selects the task with the earliest scheduling deadline as the one + to be executed next. Thanks to this feature, tasks that do not strictly comply + with the "traditional" real-time task model (see Section 3) can effectively + use the new policy. In more details, the CBS algorithm assigns scheduling deadlines to tasks in the following way: @@ -64,45 +64,45 @@ CONTENTS "deadline", and "period" parameters; - The state of the task is described by a "scheduling deadline", and - a "current runtime". These two parameters are initially set to 0; + a "remaining runtime". These two parameters are initially set to 0; - When a SCHED_DEADLINE task wakes up (becomes ready for execution), the scheduler checks if - current runtime runtime - ---------------------------------- > ---------------- - scheduling deadline - current time period + remaining runtime runtime + ---------------------------------- > --------- + scheduling deadline - current time period then, if the scheduling deadline is smaller than the current time, or this condition is verified, the scheduling deadline and the - current budget are re-initialised as + remaining runtime are re-initialised as scheduling deadline = current time + deadline - current runtime = runtime + remaining runtime = runtime - otherwise, the scheduling deadline and the current runtime are + otherwise, the scheduling deadline and the remaining runtime are left unchanged; - When a SCHED_DEADLINE task executes for an amount of time t, its - current runtime is decreased as + remaining runtime is decreased as - current runtime = current runtime - t + remaining runtime = remaining runtime - t (technically, the runtime is decreased at every tick, or when the task is descheduled / preempted); - - When the current runtime becomes less or equal than 0, the task is + - When the remaining runtime becomes less or equal than 0, the task is said to be "throttled" (also known as "depleted" in real-time literature) and cannot be scheduled until its scheduling deadline. The "replenishment time" for this task (see next item) is set to be equal to the current value of the scheduling deadline; - When the current time is equal to the replenishment time of a - throttled task, the scheduling deadline and the current runtime are + throttled task, the scheduling deadline and the remaining runtime are updated as scheduling deadline = scheduling deadline + period - current runtime = current runtime + runtime + remaining runtime = remaining runtime + runtime 3. Scheduling Real-Time Tasks @@ -147,6 +147,8 @@ CONTENTS and the absolute deadlines (d_j) coincide, so a proper admission control allows to respect the jobs' absolute deadlines for this task (this is what is called "hard schedulability property" and is an extension of Lemma 1 of [2]). + Notice that if runtime > deadline the admission control will surely reject + this task, as it is not possible to respect its temporal constraints. References: 1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram- @@ -156,7 +158,7 @@ CONTENTS Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab - Technical Report. http://xoomer.virgilio.it/lucabe72/pubs/tr-98-01.ps + Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf 4. Bandwidth management ======================= -- cgit v0.10.2 From 0d9ba8b03cfaed2696de42fe15ed410ba2ec7dbe Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 9 Sep 2014 10:57:13 +0100 Subject: Documentation/scheduler/sched-deadline.txt: Rewrite section 4 intro Section 4 intro was still describing the old interface. Rewrite it. Signed-off-by: Juri Lelli Signed-off-by: Luca Abeni Reviewed-by: Henrik Austad Cc: Randy Dunlap Cc: Peter Zijlstra Cc: Dario Faggioli Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410256636-26171-3-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index a029891..f75d832 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -165,39 +165,38 @@ CONTENTS In order for the -deadline scheduling to be effective and useful, it is important to have some method to keep the allocation of the available CPU - bandwidth to the tasks under control. - This is usually called "admission control" and if it is not performed at all, - no guarantee can be given on the actual scheduling of the -deadline tasks. - - Since when RT-throttling has been introduced each task group has a bandwidth - associated, calculated as a certain amount of runtime over a period. - Moreover, to make it possible to manipulate such bandwidth, readable/writable - controls have been added to both procfs (for system wide settings) and cgroupfs - (for per-group settings). - Therefore, the same interface is being used for controlling the bandwidth - distrubution to -deadline tasks. - - However, more discussion is needed in order to figure out how we want to manage - SCHED_DEADLINE bandwidth at the task group level. Therefore, SCHED_DEADLINE - uses (for now) a less sophisticated, but actually very sensible, mechanism to - ensure that a certain utilization cap is not overcome per each root_domain. - - Another main difference between deadline bandwidth management and RT-throttling + bandwidth to the tasks under control. This is usually called "admission + control" and if it is not performed at all, no guarantee can be given on + the actual scheduling of the -deadline tasks. + + The interface used to control the fraction of CPU bandwidth that can be + allocated to -deadline tasks is similar to the one already used for -rt + tasks with real-time group scheduling (a.k.a. RT-throttling - see + Documentation/scheduler/sched-rt-group.txt), and is based on readable/ + writable control files located in procfs (for system wide settings). + Notice that per-group settings (controlled through cgroupfs) are still not + defined for -deadline tasks, because more discussion is needed in order to + figure out how we want to manage SCHED_DEADLINE bandwidth at the task group + level. + + A main difference between deadline bandwidth management and RT-throttling is that -deadline tasks have bandwidth on their own (while -rt ones don't!), - and thus we don't need an higher level throttling mechanism to enforce the - desired bandwidth. + and thus we don't need a higher level throttling mechanism to enforce the + desired bandwidth. Therefore, using this simple interface we can put a cap + on total utilization of -deadline tasks (i.e., \Sum (runtime_i / period_i) < + global_dl_utilization_cap). 4.1 System wide settings ------------------------ The system wide settings are configured under the /proc virtual file system. - For now the -rt knobs are used for dl admission control and the -deadline - runtime is accounted against the -rt runtime. We realise that this isn't - entirely desirable; however, it is better to have a small interface for now, - and be able to change it easily later. The ideal situation (see 5.) is to run - -rt tasks from a -deadline server; in which case the -rt bandwidth is a direct - subset of dl_bw. + For now the -rt knobs are used for -deadline admission control and the + -deadline runtime is accounted against the -rt runtime. We realise that this + isn't entirely desirable; however, it is better to have a small interface for + now, and be able to change it easily later. The ideal situation (see 5.) is to + run -rt tasks from a -deadline server; in which case the -rt bandwidth is a + direct subset of dl_bw. This means that, for a root_domain comprising M CPUs, -deadline tasks can be created while the sum of their bandwidths stays below: -- cgit v0.10.2 From b56bfc6cd13c25264f614320de9183a5dbcab6ca Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Tue, 9 Sep 2014 10:57:14 +0100 Subject: Documentation/scheduler/sched-deadline.txt: Improve and clarify AC bits Admission control is of key importance for SCHED_DEADLINE, since it guarantees system schedulability (or tells us something about the degree of guarantees we can provide to the user). This patch improves and clarifies bits and pieces regarding AC, both for UP and SMP systems. Signed-off-by: Luca Abeni Signed-off-by: Juri Lelli Reviewed-by: Henrik Austad Cc: Randy Dunlap Cc: Peter Zijlstra Cc: Dario Faggioli Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410256636-26171-4-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index f75d832..0ce5e2c 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -38,16 +38,17 @@ CONTENTS ================== SCHED_DEADLINE uses three parameters, named "runtime", "period", and - "deadline" to schedule tasks. A SCHED_DEADLINE task is guaranteed to receive + "deadline", to schedule tasks. A SCHED_DEADLINE task should receive "runtime" microseconds of execution time every "period" microseconds, and these "runtime" microseconds are available within "deadline" microseconds from the beginning of the period. In order to implement this behaviour, every time the task wakes up, the scheduler computes a "scheduling deadline" consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then scheduled using EDF[1] on these scheduling deadlines (the task with the - earliest scheduling deadline is selected for execution). Notice that this - guaranteed is respected if a proper "admission control" strategy (see Section - "4. Bandwidth management") is used. + earliest scheduling deadline is selected for execution). Notice that the + task actually receives "runtime" time units within "deadline" if a proper + "admission control" strategy (see Section "4. Bandwidth management") is used + (clearly, if the system is overloaded this guarantee cannot be respected). Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so that each task runs for at most its runtime every period, avoiding any @@ -134,6 +135,50 @@ CONTENTS A real-time task can be periodic with period P if r_{j+1} = r_j + P, or sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally, d_j = r_j + D, where D is the task's relative deadline. + The utilisation of a real-time task is defined as the ratio between its + WCET and its period (or minimum inter-arrival time), and represents + the fraction of CPU time needed to execute the task. + + If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal + to the number of CPUs), then the scheduler is unable to respect all the + deadlines. + Note that total utilisation is defined as the sum of the utilisations + WCET_i/P_i over all the real-time tasks in the system. When considering + multiple real-time tasks, the parameters of the i-th task are indicated + with the "_i" suffix. + Moreover, if the total utilisation is larger than M, then we risk starving + non- real-time tasks by real-time tasks. + If, instead, the total utilisation is smaller than M, then non real-time + tasks will not be starved and the system might be able to respect all the + deadlines. + As a matter of fact, in this case it is possible to provide an upper bound + for tardiness (defined as the maximum between 0 and the difference + between the finishing time of a job and its absolute deadline). + More precisely, it can be proven that using a global EDF scheduler the + maximum tardiness of each task is smaller or equal than + ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max + where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i} + is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation. + + If M=1 (uniprocessor system), or in case of partitioned scheduling (each + real-time task is statically assigned to one and only one CPU), it is + possible to formally check if all the deadlines are respected. + If D_i = P_i for all tasks, then EDF is able to respect all the deadlines + of all the tasks executing on a CPU if and only if the total utilisation + of the tasks running on such a CPU is smaller or equal than 1. + If D_i != P_i for some task, then it is possible to define the density of + a task as C_i/min{D_i,T_i}, and EDF is able to respect all the deadlines + of all the tasks running on a CPU if the sum sum_i C_i/min{D_i,T_i} of the + densities of the tasks running on such a CPU is smaller or equal than 1 + (notice that this condition is only sufficient, and not necessary). + + On multiprocessor systems with global EDF scheduling (non partitioned + systems), a sufficient test for schedulability can not be based on the + utilisations (it can be shown that task sets with utilisations slightly + larger than 1 can miss deadlines regardless of the number of CPUs M). + However, as previously stated, enforcing that the total utilisation is smaller + than M is enough to guarantee that non real-time tasks are not starved and + that the tardiness of real-time tasks has an upper bound. SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that the jobs' deadlines of a task are respected. In order to do this, a task @@ -163,14 +208,22 @@ CONTENTS 4. Bandwidth management ======================= - In order for the -deadline scheduling to be effective and useful, it is - important to have some method to keep the allocation of the available CPU - bandwidth to the tasks under control. This is usually called "admission - control" and if it is not performed at all, no guarantee can be given on - the actual scheduling of the -deadline tasks. - - The interface used to control the fraction of CPU bandwidth that can be - allocated to -deadline tasks is similar to the one already used for -rt + As previously mentioned, in order for -deadline scheduling to be + effective and useful (that is, to be able to provide "runtime" time units + within "deadline"), it is important to have some method to keep the allocation + of the available fractions of CPU time to the various tasks under control. + This is usually called "admission control" and if it is not performed, then + no guarantee can be given on the actual scheduling of the -deadline tasks. + + As already stated in Section 3, a necessary condition to be respected to + correctly schedule a set of real-time tasks is that the total utilisation + is smaller than M. When talking about -deadline tasks, this requires that + the sum of the ratio between runtime and period for all tasks is smaller + than M. Notice that the ratio runtime/period is equivalent to the utilisation + of a "traditional" real-time task, and is also often referred to as + "bandwidth". + The interface used to control the CPU bandwidth that can be allocated + to -deadline tasks is similar to the one already used for -rt tasks with real-time group scheduling (a.k.a. RT-throttling - see Documentation/scheduler/sched-rt-group.txt), and is based on readable/ writable control files located in procfs (for system wide settings). @@ -182,9 +235,13 @@ CONTENTS A main difference between deadline bandwidth management and RT-throttling is that -deadline tasks have bandwidth on their own (while -rt ones don't!), and thus we don't need a higher level throttling mechanism to enforce the - desired bandwidth. Therefore, using this simple interface we can put a cap - on total utilization of -deadline tasks (i.e., \Sum (runtime_i / period_i) < - global_dl_utilization_cap). + desired bandwidth. In other words, this means that interface parameters are + only used at admission control time (i.e., when the user calls + sched_setattr()). Scheduling is then performed considering actual tasks' + parameters, so that CPU bandwidth is allocated to SCHED_DEADLINE tasks + respecting their needs in terms of granularity. Therefore, using this simple + interface we can put a cap on total utilization of -deadline tasks (i.e., + \Sum (runtime_i / period_i) < global_dl_utilization_cap). 4.1 System wide settings ------------------------ @@ -232,8 +289,16 @@ CONTENTS 950000. With rt_period equal to 1000000, by default, it means that -deadline tasks can use at most 95%, multiplied by the number of CPUs that compose the root_domain, for each root_domain. - - A -deadline task cannot fork. + This means that non -deadline tasks will receive at least 5% of the CPU time, + and that -deadline tasks will receive their runtime with a guaranteed + worst-case delay respect to the "deadline" parameter. If "deadline" = "period" + and the cpuset mechanism is used to implement partitioned scheduling (see + Section 5), then this simple setting of the bandwidth management is able to + deterministically guarantee that -deadline tasks will receive their runtime + in a period. + + Finally, notice that in order not to jeopardize the admission control a + -deadline task cannot fork. 5. Tasks CPU affinity ===================== -- cgit v0.10.2 From f5801933ce595ba6eb77d170ab0dfbcd5c894e11 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 9 Sep 2014 10:57:15 +0100 Subject: Documentation/scheduler/sched-deadline.txt: Add tests suite appendix Add an appendix briefly describing tools that can be used to test SCHED_DEADLINE (and the scheduler in general). Links to where source code of the tools is hosted are also provided. Signed-off-by: Juri Lelli Reviewed-by: Henrik Austad Cc: Randy Dunlap Cc: Peter Zijlstra Cc: Dario Faggioli Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410256636-26171-5-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 0ce5e2c..b4aad31 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -15,6 +15,7 @@ CONTENTS 5. Tasks CPU affinity 5.1 SCHED_DEADLINE and cpusets HOWTO 6. Future plans + A. Test suite 0. WARNING @@ -345,3 +346,54 @@ CONTENTS throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in the preliminary phases of the merge and we really seek feedback that would help us decide on the direction it should take. + +Appendix A. Test suite +====================== + + The SCHED_DEADLINE policy can be easily tested using two applications that + are part of a wider Linux Scheduler validation suite. The suite is + available as a GitHub repository: https://github.com/scheduler-tools. + + The first testing application is called rt-app and can be used to + start multiple threads with specific parameters. rt-app supports + SCHED_{OTHER,FIFO,RR,DEADLINE} scheduling policies and their related + parameters (e.g., niceness, priority, runtime/deadline/period). rt-app + is a valuable tool, as it can be used to synthetically recreate certain + workloads (maybe mimicking real use-cases) and evaluate how the scheduler + behaves under such workloads. In this way, results are easily reproducible. + rt-app is available at: https://github.com/scheduler-tools/rt-app. + + Thread parameters can be specified from the command line, with something like + this: + + # rt-app -t 100000:10000:d -t 150000:20000:f:10 -D5 + + The above creates 2 threads. The first one, scheduled by SCHED_DEADLINE, + executes for 10ms every 100ms. The second one, scheduled at SCHED_FIFO + priority 10, executes for 20ms every 150ms. The test will run for a total + of 5 seconds. + + More interestingly, configurations can be described with a json file that + can be passed as input to rt-app with something like this: + + # rt-app my_config.json + + The parameters that can be specified with the second method are a superset + of the command line options. Please refer to rt-app documentation for more + details (/doc/*.json). + + The second testing application is a modification of schedtool, called + schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a + certain pid/application. schedtool-dl is available at: + https://github.com/scheduler-tools/schedtool-dl.git. + + The usage is straightforward: + + # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app + + With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation + of 10ms every 100ms (note that parameters are expressed in microseconds). + You can also use schedtool to create a reservation for an already running + application, given that you know its pid: + + # schedtool -E -t 10000000:100000000 my_app_pid -- cgit v0.10.2 From 13924d2a983fc1557eb737ea59e2324adb538fa2 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 9 Sep 2014 10:57:16 +0100 Subject: Documentation/scheduler/sched-deadline.txt: Add minimal main() appendix Add an appendix providing a simple self-contained code snippet showing how SCHED_DEADLINE reservations can be created by application developers. Signed-off-by: Juri Lelli Reviewed-by: Henrik Austad Cc: Randy Dunlap Cc: Peter Zijlstra Cc: Dario Faggioli Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410256636-26171-6-git-send-email-juri.lelli@arm.com [ Fixed some whitespace inconsistencies. ] Signed-off-by: Ingo Molnar diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index b4aad31..21461a0 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -16,6 +16,7 @@ CONTENTS 5.1 SCHED_DEADLINE and cpusets HOWTO 6. Future plans A. Test suite + B. Minimal main() 0. WARNING @@ -397,3 +398,128 @@ Appendix A. Test suite application, given that you know its pid: # schedtool -E -t 10000000:100000000 my_app_pid + +Appendix B. Minimal main() +========================== + + We provide in what follows a simple (ugly) self-contained code snippet + showing how SCHED_DEADLINE reservations can be created by a real-time + application developer. + + #define _GNU_SOURCE + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #define gettid() syscall(__NR_gettid) + + #define SCHED_DEADLINE 6 + + /* XXX use the proper syscall numbers */ + #ifdef __x86_64__ + #define __NR_sched_setattr 314 + #define __NR_sched_getattr 315 + #endif + + #ifdef __i386__ + #define __NR_sched_setattr 351 + #define __NR_sched_getattr 352 + #endif + + #ifdef __arm__ + #define __NR_sched_setattr 380 + #define __NR_sched_getattr 381 + #endif + + static volatile int done; + + struct sched_attr { + __u32 size; + + __u32 sched_policy; + __u64 sched_flags; + + /* SCHED_NORMAL, SCHED_BATCH */ + __s32 sched_nice; + + /* SCHED_FIFO, SCHED_RR */ + __u32 sched_priority; + + /* SCHED_DEADLINE (nsec) */ + __u64 sched_runtime; + __u64 sched_deadline; + __u64 sched_period; + }; + + int sched_setattr(pid_t pid, + const struct sched_attr *attr, + unsigned int flags) + { + return syscall(__NR_sched_setattr, pid, attr, flags); + } + + int sched_getattr(pid_t pid, + struct sched_attr *attr, + unsigned int size, + unsigned int flags) + { + return syscall(__NR_sched_getattr, pid, attr, size, flags); + } + + void *run_deadline(void *data) + { + struct sched_attr attr; + int x = 0; + int ret; + unsigned int flags = 0; + + printf("deadline thread started [%ld]\n", gettid()); + + attr.size = sizeof(attr); + attr.sched_flags = 0; + attr.sched_nice = 0; + attr.sched_priority = 0; + + /* This creates a 10ms/30ms reservation */ + attr.sched_policy = SCHED_DEADLINE; + attr.sched_runtime = 10 * 1000 * 1000; + attr.sched_period = attr.sched_deadline = 30 * 1000 * 1000; + + ret = sched_setattr(0, &attr, flags); + if (ret < 0) { + done = 0; + perror("sched_setattr"); + exit(-1); + } + + while (!done) { + x++; + } + + printf("deadline thread dies [%ld]\n", gettid()); + return NULL; + } + + int main (int argc, char **argv) + { + pthread_t thread; + + printf("main thread [%ld]\n", gettid()); + + pthread_create(&thread, NULL, run_deadline, NULL); + + sleep(10); + + done = 1; + pthread_join(thread, NULL); + + printf("main dies [%ld]\n", gettid()); + return 0; + } -- cgit v0.10.2 From ba7e5a279e72f4b246dc7a419ac707e1936ede3e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 4 Sep 2014 16:35:30 -0400 Subject: sched/numa: Use select_idle_sibling() to select a destination for task_numa_move() The code in task_numa_compare() will only examine at most one idle CPU per node, because they all have the same score. However, some idle CPUs are better candidates than others, due to busy or idle SMT siblings, etc... The scheduler has logic to find the best CPU within an LLC to place a task. The NUMA code should probably use it. This seems to reduce the standard deviation for single instance SPECjbb2005 with a low warehouse count on my 4 node test system. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: mgorman@suse.de Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140904163530.189d410a@cuia.bos.redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be9e97b..96e7147 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -665,6 +665,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); @@ -1257,6 +1258,13 @@ balance: if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; + /* + * One idle CPU per node is evaluated for a task numa move. + * Call select_idle_sibling to maybe find a better one. + */ + if (!cur) + env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + assign: task_numa_assign(env, cur, imp); unlock: -- cgit v0.10.2 From f6be8af1c95de4a46e325e728900a70ceadb52cf Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 4 Sep 2014 15:17:53 +0800 Subject: sched: Add new API wake_up_if_idle() to wake up the idle cpu Implementing one new API wake_up_if_idle(), which is used to wake up the idle CPU. Suggested-by: Andy Lutomirski Signed-off-by: Chuansheng Liu Signed-off-by: Peter Zijlstra (Intel) Cc: daniel.lezcano@linaro.org Cc: rjw@rjwysocki.net Cc: linux-pm@vger.kernel.org Cc: changcheng.liu@intel.com Cc: xiaoming.wang@intel.com Cc: souvik.k.chakravarty@intel.com Cc: chuansheng.liu@intel.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409815075-4180-1-git-send-email-chuansheng.liu@intel.com Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index dd9eb48..82ff3d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1024,6 +1024,7 @@ struct sched_domain_topology_level { extern struct sched_domain_topology_level *sched_domain_topology; extern void set_sched_topology(struct sched_domain_topology_level *tl); +extern void wake_up_if_idle(int cpu); #ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78e5c83..f7c6ed2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1634,6 +1634,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) } } +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!is_idle_task(rq->curr)) + return; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -- cgit v0.10.2 From c6f4459fc3ba532e896cb678e29b45cb985f82bf Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 4 Sep 2014 15:17:54 +0800 Subject: smp: Add new wake_up_all_idle_cpus() function Currently kick_all_cpus_sync() can break non-polling idle cpus thru IPI interrupts. But sometimes we need to break the polling idle cpus immediately to reselect the suitable c-state, also for non-idle cpus, we need to do nothing if we try to wake up them. Here adding one new function wake_up_all_idle_cpus() to let all cpus out of idle based on function wake_up_if_idle(). Signed-off-by: Chuansheng Liu Signed-off-by: Peter Zijlstra (Intel) Cc: daniel.lezcano@linaro.org Cc: rjw@rjwysocki.net Cc: linux-pm@vger.kernel.org Cc: changcheng.liu@intel.com Cc: xiaoming.wang@intel.com Cc: souvik.k.chakravarty@intel.com Cc: luto@amacapital.net Cc: Andrew Morton Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Jan Kara Cc: Jens Axboe Cc: Jens Axboe Cc: Linus Torvalds Cc: Michal Hocko Cc: Paul Gortmaker Cc: Roman Gushchin Cc: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/1409815075-4180-2-git-send-email-chuansheng.liu@intel.com Signed-off-by: Ingo Molnar diff --git a/include/linux/smp.h b/include/linux/smp.h index 34347f2..93dff5f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -100,6 +100,7 @@ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); +void wake_up_all_idle_cpus(void); /* * Generic and arch helpers @@ -148,6 +149,7 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, } static inline void kick_all_cpus_sync(void) { } +static inline void wake_up_all_idle_cpus(void) { } #endif /* !SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa1..9e0d0b2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "smpboot.h" @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); + +/** + * wake_up_all_idle_cpus - break all cpus out of idle + * wake_up_all_idle_cpus try to break all cpus which is in idle state even + * including idle polling cpus, for non-idle cpus, we will do nothing + * for them. + */ +void wake_up_all_idle_cpus(void) +{ + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + + wake_up_if_idle(cpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); -- cgit v0.10.2 From 2ed903c5485bad0eafdd3d59ff993598736e4f31 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 4 Sep 2014 15:17:55 +0800 Subject: cpuidle: Use wake_up_all_idle_cpus() to wake up all idle cpus Currently kick_all_cpus_sync() or smp_call_function() can not break the polling idle cpu immediately. Instead using wake_up_all_idle_cpus() which can wake up the polling idle cpu quickly is much more helpful for power. Signed-off-by: Chuansheng Liu Signed-off-by: Peter Zijlstra (Intel) Cc: linux-pm@vger.kernel.org Cc: changcheng.liu@intel.com Cc: xiaoming.wang@intel.com Cc: souvik.k.chakravarty@intel.com Cc: luto@amacapital.net Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Rafael J. Wysocki Cc: linux-pm@vger.kernel.org Link: http://lkml.kernel.org/r/1409815075-4180-3-git-send-email-chuansheng.liu@intel.com Signed-off-by: Ingo Molnar diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index ee9df5e..d31e04c 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -223,7 +223,7 @@ void cpuidle_uninstall_idle_handler(void) { if (enabled_devices) { initialized = 0; - kick_all_cpus_sync(); + wake_up_all_idle_cpus(); } } @@ -530,11 +530,6 @@ EXPORT_SYMBOL_GPL(cpuidle_register); #ifdef CONFIG_SMP -static void smp_callback(void *v) -{ - /* we already woke the CPU up, nothing more to do */ -} - /* * This function gets called when a part of the kernel has a new latency * requirement. This means we need to get all processors out of their C-state, @@ -544,7 +539,7 @@ static void smp_callback(void *v) static int cpuidle_latency_notify(struct notifier_block *b, unsigned long l, void *v) { - smp_call_function(smp_callback, NULL, 1); + wake_up_all_idle_cpus(); return NOTIFY_OK; } -- cgit v0.10.2 From ef8ac06359ddf95431cf6bb04ad2b36fff562328 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 12 Sep 2014 09:12:14 -0400 Subject: seqlock: Add irqsave variant of read_seqbegin_or_lock() There are cases where read_seqbegin_or_lock() needs to block irqs, because the seqlock in question nests inside a lock that is also be taken from irq context. Add read_seqbegin_or_lock_irqsave() and done_seqretry_irqrestore(), which are almost identical to read_seqbegin_or_lock() and done_seqretry(). Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: prarit@redhat.com Cc: oleg@redhat.com Cc: sgruszka@redhat.com Cc: Al Viro Cc: John Stultz Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Stephen Boyd Cc: Trond Myklebust Link: http://lkml.kernel.org/r/1410527535-9814-2-git-send-email-riel@redhat.com [ Improved the readability of the code a bit. ] Signed-off-by: Ingo Molnar diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cc35963..f5df8f6 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -456,4 +456,23 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) spin_unlock_irqrestore(&sl->lock, flags); } +static inline unsigned long +read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) +{ + unsigned long flags = 0; + + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + read_seqlock_excl_irqsave(lock, flags); + + return flags; +} + +static inline void +done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) +{ + if (seq & 1) + read_sequnlock_excl_irqrestore(lock, flags); +} #endif /* __LINUX_SEQLOCK_H */ -- cgit v0.10.2 From 9c368b5b6eccce1cbd7f68142106b3b4ddb1c5b5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 12 Sep 2014 09:12:15 -0400 Subject: sched, time: Fix lock inversion in thread_group_cputime() The sig->stats_lock nests inside the tasklist_lock and the sighand->siglock in __exit_signal and wait_task_zombie. However, both of those locks can be taken from irq context, which means we need to use the interrupt safe variant of read_seqbegin_or_lock. This blocks interrupts when the "lock" branch is taken (seq is odd), preventing the lock inversion. On the first (lockless) pass through the loop, irqs are not blocked. Reported-by: Stanislaw Gruszka Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: prarit@redhat.com Cc: oleg@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410527535-9814-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2b57031..64492df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -289,13 +289,14 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) cputime_t utime, stime; struct task_struct *t; unsigned int seq, nextseq; + unsigned long flags; rcu_read_lock(); /* Attempt a lockless read on the first round. */ nextseq = 0; do { seq = nextseq; - read_seqbegin_or_lock(&sig->stats_lock, &seq); + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; @@ -309,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) /* If lockless access failed, take the lock. */ nextseq = 1; } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry(&sig->stats_lock, seq); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } -- cgit v0.10.2 From f139caf2e89713687514d9db847a4fa2e29c87a2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:40:54 +0400 Subject: sched, cleanup, treewide: Remove set_current_state(TASK_RUNNING) after schedule() schedule(), io_schedule() and schedule_timeout() always return with TASK_RUNNING state set, so one more setting is unnecessary. (All places in patch are visible good, only exception is kiblnd_scheduler() from: drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c Its schedule() is one line above standard 3 lines of unified diff) No places where set_current_state() is used for mb(). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529254.3569.23.camel@tkhai Cc: Alasdair Kergon Cc: Anil Belur Cc: Arnd Bergmann Cc: Dave Kleikamp Cc: David Airlie Cc: David Howells Cc: Dmitry Eremin Cc: Frank Blaschka Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Helge Deller Cc: Isaac Huang Cc: James E.J. Bottomley Cc: James E.J. Bottomley Cc: J. Bruce Fields Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Laura Abbott Cc: Liang Zhen Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Masaru Nomura Cc: Michael Opdenacker Cc: Mikael Starvik Cc: Mike Snitzer Cc: Neil Brown Cc: Oleg Drokin Cc: Peng Tao Cc: Richard Weinberger Cc: Robert Love Cc: Steven Rostedt Cc: Trond Myklebust Cc: Ursula Braun Cc: Zi Shen Lim Cc: devel@driverdev.osuosl.org Cc: dm-devel@redhat.com Cc: dri-devel@lists.freedesktop.org Cc: fcoe-devel@open-fcoe.org Cc: jfs-discussion@lists.sourceforge.net Cc: linux390@de.ibm.com Cc: linux-afs@lists.infradead.org Cc: linux-cris-kernel@axis.com Cc: linux-kernel@vger.kernel.org Cc: linux-nfs@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linux-raid@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: qla2xxx-upstream@qlogic.com Cc: user-mode-linux-devel@lists.sourceforge.net Cc: user-mode-linux-user@lists.sourceforge.net Signed-off-by: Ingo Molnar diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 29eb02a..0f39832 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ b/arch/cris/arch-v10/drivers/sync_serial.c @@ -1086,7 +1086,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf, } local_irq_restore(flags); schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&port->out_wait_q, &wait); if (signal_pending(current)) return -EINTR; diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c index bbb806b..5a14913 100644 --- a/arch/cris/arch-v32/drivers/sync_serial.c +++ b/arch/cris/arch-v32/drivers/sync_serial.c @@ -1089,7 +1089,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf, } schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&port->out_wait_q, &wait); if (signal_pending(current)) diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 9e3a722..dd16c90 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -79,7 +79,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, set_task_state(current, TASK_INTERRUPTIBLE); schedule(); - set_task_state(current, TASK_RUNNING); remove_wait_queue(&host_read_wait, &wait); if (atomic_dec_and_test(&host_sleep_count)) { diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index d2077f0..d07f810 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -403,7 +403,6 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) } schedule(); remove_wait_queue(&vga_wait_queue, &wait); - set_current_state(TASK_RUNNING); } return rc; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ab472c5..0505559 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -720,7 +720,6 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) io_schedule(); - set_task_state(current, TASK_RUNNING); remove_wait_queue(&c->free_buffer_wait, &wait); dm_bufio_lock(c); diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index 90cca5e..ef31b77 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -121,7 +121,6 @@ static int kpowerswd(void *param) unsigned long soft_power_reg = (unsigned long) param; schedule_timeout_interruptible(pwrsw_enabled ? HZ : HZ/POWERSWITCH_POLL_PER_SEC); - __set_current_state(TASK_RUNNING); if (unlikely(!pwrsw_enabled)) continue; diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c index fbc6701..213e54e 100644 --- a/drivers/s390/net/claw.c +++ b/drivers/s390/net/claw.c @@ -481,7 +481,6 @@ claw_open(struct net_device *dev) spin_unlock_irqrestore( get_ccwdev_lock(privptr->channel[i].cdev), saveflags); schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&privptr->channel[i].wait, &wait); if(rc != 0) ccw_check_return_code(privptr->channel[i].cdev, rc); @@ -828,7 +827,6 @@ claw_release(struct net_device *dev) spin_unlock_irqrestore( get_ccwdev_lock(privptr->channel[i].cdev), saveflags); schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&privptr->channel[i].wait, &wait); if (rc != 0) { ccw_check_return_code(privptr->channel[i].cdev, rc); diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 00ee0ed..4a8ac7d 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1884,7 +1884,6 @@ retry: set_current_state(TASK_INTERRUPTIBLE); spin_unlock_bh(&p->fcoe_rx_list.lock); schedule(); - set_current_state(TASK_RUNNING); goto retry; } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index be9698d..8b5a5dc 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4853,7 +4853,6 @@ qla2x00_do_dpc(void *data) "DPC handler sleeping.\n"); schedule(); - __set_current_state(TASK_RUNNING); if (!base_vha->flags.init_done || ha->flags.mbox_busy) goto end_loop; diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 306d728..b94f743 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3215,7 +3215,6 @@ kiblnd_connd (void *arg) schedule_timeout(timeout); - set_current_state(TASK_RUNNING); remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); } @@ -3432,7 +3431,6 @@ kiblnd_scheduler(void *arg) busy_loops = 0; remove_wait_queue(&sched->ibs_waitq, &wait); - set_current_state(TASK_RUNNING); spin_lock_irqsave(&sched->ibs_lock, flags); } @@ -3507,7 +3505,6 @@ kiblnd_failover_thread(void *arg) rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : cfs_time_seconds(1)); - set_current_state(TASK_RUNNING); remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); write_lock_irqsave(glock, flags); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c index 5214399..9994fc6 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -2233,7 +2233,6 @@ ksocknal_connd (void *arg) nloops = 0; schedule_timeout(timeout); - set_current_state(TASK_RUNNING); remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); spin_lock_bh(connd_lock); } diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c index 1bf9c90..e73ca3d 100644 --- a/drivers/staging/lustre/lustre/libcfs/fail.c +++ b/drivers/staging/lustre/lustre/libcfs/fail.c @@ -131,7 +131,6 @@ int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) id, ms); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(ms) / 1000); - set_current_state(TASK_RUNNING); CERROR("cfs_fail_timeout id %x awake\n", id); } return ret; diff --git a/drivers/tty/bfin_jtag_comm.c b/drivers/tty/bfin_jtag_comm.c index 8096fcb..d7b198c 100644 --- a/drivers/tty/bfin_jtag_comm.c +++ b/drivers/tty/bfin_jtag_comm.c @@ -77,7 +77,6 @@ bfin_jc_emudat_manager(void *arg) pr_debug("waiting for readers\n"); __set_current_state(TASK_UNINTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); continue; } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index b6df2e8..5297678 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, /* second+ BUSY - sleep a little bit */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); - __set_current_state(TASK_RUNNING); } continue; } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 0acddf6..bc462dc 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1585,7 +1585,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait) set_current_state(TASK_UNINTERRUPTIBLE); LOGGC_UNLOCK(log); schedule(); - __set_current_state(TASK_RUNNING); LOGGC_LOCK(log); remove_wait_queue(&target->gcwait, &__wait); } @@ -2359,7 +2358,6 @@ int jfsIOWait(void *arg) set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); schedule(); - __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 564c4f2..d595856 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -136,7 +136,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); io_schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(event, &wait); } @@ -2808,7 +2807,6 @@ int jfs_lazycommit(void *arg) set_current_state(TASK_INTERRUPTIBLE); LAZY_UNLOCK(flags); schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(&jfs_commit_thread_wait, &wq); } } while (!kthread_should_stop()); @@ -2996,7 +2994,6 @@ int jfs_sync(void *arg) set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); schedule(); - __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index 04303b5..9fde840 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -146,7 +146,6 @@ nfs4_blk_decode_device(struct nfs_server *server, set_current_state(TASK_UNINTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(&nn->bl_wq, &wq); if (reply->status != BL_DEVICE_REQUEST_PROC) { diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 8999cfd..b18680e 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -76,7 +76,6 @@ static void dev_remove(struct net *net, dev_t dev) set_current_state(TASK_UNINTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(&nn->bl_wq, &wq); out: diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9c271f4..8f1af78 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -670,7 +670,6 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) } schedule(); - set_current_state(TASK_RUNNING); if (msg.errno < 0) ret = msg.errno; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7d..ab370ff 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, */ if (!expires) { schedule(); - __set_current_state(TASK_RUNNING); return -EINTR; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1..3f9e328 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) break; schedule(); - __set_current_state(TASK_RUNNING); } reader_finish = 0; complete(&read_done); @@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) break; schedule(); - __set_current_state(TASK_RUNNING); } __set_current_state(TASK_RUNNING); @@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); - __set_current_state(TASK_RUNNING); } if (kill_test) -- cgit v0.10.2 From a8edd075323cec607797fdd1d7b1222c987f4a47 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:41:16 +0400 Subject: sched/fair: cleanup: Remove useless assignment in select_task_rq_fair() new_cpu is reassigned below, so we do not need this here. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529276.3569.24.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e7147..9807a99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4521,11 +4521,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (p->nr_cpus_allowed == 1) return prev_cpu; - if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - want_affine = 1; - new_cpu = prev_cpu; - } + if (sd_flag & SD_BALANCE_WAKE) + want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { -- cgit v0.10.2 From f3cd1c4ec059c956d3346705e453aff3ace3b494 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:41:40 +0400 Subject: sched/core: Use put_prev_task() accessor where possible Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529300.3569.25.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f7c6ed2..5536397 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); /* * Boosting condition are: @@ -3586,7 +3586,7 @@ change: if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); prev_class = p->sched_class; __setscheduler(rq, p, attr); @@ -4792,7 +4792,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); p->numa_preferred_nid = nid; @@ -7374,7 +7374,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); + put_prev_task(rq, tsk); tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), -- cgit v0.10.2 From f3f1768f89d601ad29f4701deef91caaa82b9f57 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:42:01 +0400 Subject: sched/rt: Remove useless if from cleanup pick_next_task_rt() _pick_next_task_rt() never returns NULL. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529321.3569.26.camel@tkhai Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4feac8f..2e6a774 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); + dequeue_pushable_task(rq, p); set_post_schedule(rq); -- cgit v0.10.2 From 1ba93d42727c44001aa8ccffd39c8ab5705379e2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:42:20 +0400 Subject: sched/dl: Simplify pick_dl_task() 1) Nobody calls pick_dl_task() with negative cpu, it's old RT leftover. 2) If p->nr_cpus_allowed is 1, than the affinity has just been changed in set_cpus_allowed_ptr(); we'll pick it just earlier than migration thread. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529340.3569.27.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cc4eb89..aaa5abb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1121,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; - return 0; } -- cgit v0.10.2 From a15b12ac36ad4e7b856a4ae54937ae26a51aebad Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 15:03:34 +0400 Subject: sched: Do not stop cpu in set_cpus_allowed_ptr() if task is not running If a task is queued but not running on it rq, we can simply migrate it without migration thread and switching of context. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410519814.3569.7.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5536397..4b1ddeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4629,6 +4629,33 @@ void init_idle(struct task_struct *idle, int cpu) } #ifdef CONFIG_SMP +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { if (p->sched_class && p->sched_class->set_cpus_allowed) @@ -4685,14 +4712,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_on_rq_queued(p) || p->state == TASK_WAKING) { + if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; - } + } else if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); out: task_rq_unlock(rq, p, &flags); @@ -4735,19 +4763,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (task_on_rq_queued(p)) { - dequeue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, dest_cpu); - raw_spin_unlock(&rq->lock); - - rq = cpu_rq(dest_cpu); - raw_spin_lock(&rq->lock); - BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; - enqueue_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - } + if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); done: ret = 1; fail: -- cgit v0.10.2 From d4311ff1a8da48d609db9500f121c15580dfeeb7 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:17 +0100 Subject: init/main.c: Give init_task a canary Tasks get their end of stack set to STACK_END_MAGIC with the aim to catch stack overruns. Currently this feature does not apply to init_task. This patch removes this restriction. Note that a similar patch was posted by Prarit Bhargava some time ago but was never merged: http://marc.info/?l=linux-kernel&m=127144305403241&w=2 Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Michael Ellerman Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Alex Thorlton Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Daeseok Youn Cc: David Rientjes Cc: Fabian Frederick Cc: Geert Uytterhoeven Cc: Jiri Olsa Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Opdenacker Cc: Paul Mackerras Cc: Prarit Bhargava Cc: Rik van Riel Cc: Rusty Russell Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Vladimir Davydov Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-2-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 51ab9e7..35d0760c 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -538,7 +537,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) regs->nip); stackend = end_of_stack(current); - if (current != &init_task && *stackend != STACK_END_MAGIC) + if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a241946..bc23a70 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,7 +3,6 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include /* STACK_END_MAGIC */ #include /* test_thread_flag(), ... */ #include /* oops_begin/end, ... */ #include /* search_exception_table */ @@ -710,7 +709,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); stackend = end_of_stack(tsk); - if (tsk != &init_task && *stackend != STACK_END_MAGIC) + if (*stackend != STACK_END_MAGIC) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; diff --git a/include/linux/sched.h b/include/linux/sched.h index 82ff3d6..118dca7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -57,6 +57,7 @@ struct sched_param { #include #include #include +#include #include @@ -2638,6 +2639,7 @@ static inline unsigned long stack_not_used(struct task_struct *p) return (unsigned long)n - (unsigned long)end_of_stack(p); } #endif +extern void set_task_stack_end_magic(struct task_struct *tsk); /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available diff --git a/init/main.c b/init/main.c index bb1aed9..5fc3fc7 100644 --- a/init/main.c +++ b/init/main.c @@ -508,6 +508,7 @@ asmlinkage __visible void __init start_kernel(void) * lockdep hash: */ lockdep_init(); + set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 9387ae8..ad64248 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +void set_task_stack_end_magic(struct task_struct *tsk) +{ + unsigned long *stackend; + + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; - unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ + set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb..1636e41 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,7 +13,6 @@ #include #include #include -#include #include @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if ((current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC)) { + if (*end_of_stack(current) != STACK_END_MAGIC) { print_max_stack(); BUG(); } -- cgit v0.10.2 From a70857e46dd13e87ae06bf0e64cb6a2d4f436265 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:18 +0100 Subject: sched: Add helper for task stack page overrun checking This facility is used in a few places so let's introduce a helper function to improve code readability. Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: oleg@redhat.com Cc: riel@redhat.com Cc: prarit@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Paul Mackerras Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-3-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 35d0760c..99b2f27 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -507,7 +507,6 @@ bail: void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; - unsigned long *stackend; /* Are we prepared to handle this fault? */ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -536,8 +535,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", regs->nip); - stackend = end_of_stack(current); - if (*stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(current)) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bc23a70..6240bc7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -648,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) { struct task_struct *tsk = current; - unsigned long *stackend; unsigned long flags; int sig; @@ -708,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); - if (*stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; diff --git a/include/linux/sched.h b/include/linux/sched.h index 118dca7..18f5262 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2617,6 +2617,8 @@ static inline unsigned long *end_of_stack(struct task_struct *p) } #endif +#define task_stack_end_corrupted(task) \ + (*(end_of_stack(task)) != STACK_END_MAGIC) static inline int object_is_on_stack(void *obj) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1636e41..16eddb3 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -170,7 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if (*end_of_stack(current) != STACK_END_MAGIC) { + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); } -- cgit v0.10.2 From 0d9e26329b0c9263d4d9e0422d80a0e73268c52f Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:19 +0100 Subject: sched: Add default-disabled option to BUG() when stack end location is overwritten Currently in the event of a stack overrun a call to schedule() does not check for this type of corruption. This corruption is often silent and can go unnoticed. However once the corrupted region is examined at a later stage, the outcome is undefined and often results in a sporadic page fault which cannot be handled. This patch checks for a stack overrun and takes appropriate action since the damage is already done, there is no point in continuing. Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: oleg@redhat.com Cc: riel@redhat.com Cc: prarit@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Cc: rostedt@goodmis.org Cc: hannes@cmpxchg.org Cc: Alexei Starovoitov Cc: Al Viro Cc: Andi Kleen Cc: Andrew Morton Cc: Dan Streetman Cc: Davidlohr Bueso Cc: David S. Miller Cc: Kees Cook Cc: Linus Torvalds Cc: Lubomir Rintel Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1410527779-8133-4-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b1ddeb..61ee2b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2693,6 +2693,9 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(unlikely(task_stack_end_corrupted(prev))); +#endif /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path. Otherwise whine diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a285900..e58163d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -824,6 +824,18 @@ config SCHEDSTATS application, you can say N to avoid the very slight overhead this adds. +config SCHED_STACK_END_CHECK + bool "Detect stack corruption on calls to schedule()" + depends on DEBUG_KERNEL + default n + help + This option checks for a stack overrun on calls to schedule(). + If the stack end location is found to be over written always panic as + the content of the corrupted region can no longer be trusted. + This is to ensure no erroneous behaviour occurs which could result in + data corruption or a sporadic crash at a later stage once the region + is examined. The runtime overhead introduced is minimal. + config TIMER_STATS bool "Collect kernel timers statistics" depends on DEBUG_KERNEL && PROC_FS -- cgit v0.10.2 From afdeee0510db918b31bb4aba47452df2ddbdbcf2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:44 +0200 Subject: sched: Fix imbalance flag reset The imbalance flag can stay set whereas there is no imbalance. Let assume that we have 3 tasks that run on a dual cores /dual cluster system. We will have some idle load balance which are triggered during tick. Unfortunately, the tick is also used to queue background work so we can reach the situation where short work has been queued on a CPU which already runs a task. The load balance will detect this imbalance (2 tasks on 1 CPU and an idle CPU) and will try to pull the waiting task on the idle CPU. The waiting task is a worker thread that is pinned on a CPU so an imbalance due to pinned task is detected and the imbalance flag is set. Then, we will not be able to clear the flag because we have at most 1 task on each CPU but the imbalance flag will trig to useless active load balance between the idle CPU and the busy CPU. We need to reset of the imbalance flag as soon as we have reached a balanced state. If all tasks are pinned, we don't consider that as a balanced state and let the imbalance flag set. Signed-off-by: Vincent Guittot Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9807a99..01856a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6765,10 +6765,8 @@ more_balance: if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; } /* All tasks on this runqueue were pinned by CPU affinity */ @@ -6779,7 +6777,7 @@ more_balance: env.loop_break = sched_nr_migrate_break; goto redo; } - goto out_balanced; + goto out_all_pinned; } } @@ -6853,6 +6851,23 @@ more_balance: goto out; out_balanced: + /* + * We reach balance although we may have faced some affinity + * constraints. Clear the imbalance flag if it was set. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if (*group_imbalance) + *group_imbalance = 0; + } + +out_all_pinned: + /* + * We reach balance because all tasks are pinned at this level so + * we can't migrate them. Let the imbalance flag set so parent level + * can try to migrate them. + */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; -- cgit v0.10.2 From 05bfb65f52cbdabe26ebb629959416a6cffb034d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:45 +0200 Subject: sched: Remove a wake_affine() condition In wake_affine() I have tried to understand the meaning of the condition: (this_load <= load && this_load + target_load(prev_cpu, idx) <= tl_per_task) but I failed to find a use case that can take advantage of it and I haven't found clear description in the previous commit's log. Futhermore, the comment of the condition refers to the task_hot function that was used before being replaced by the current condition: /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ If we look more deeply the below condition: this_load + target_load(prev_cpu, idx) <= tl_per_task When sync is clear, we have: tl_per_task = runnable_load_avg / nr_running this_load = max(runnable_load_avg, cpuload[idx]) target_load = max(runnable_load_avg', cpuload'[idx]) It implies that runnable_load_avg == 0 and nr_running <= 1 in order to match the condition. This implies that runnable_load_avg == 0 too because of the condition: this_load <= load. but if this _load is null, 'balanced' is already set and the test is redundant. If sync is set, it's not as straight forward as above (especially if cgroup are involved) but the policy should be similar as we have removed a task that's going to sleep in order to get a more accurate load and this_load values. The current conclusion is that these additional condition don't give any benefit so we can remove them. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01856a8..391eaf2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4285,7 +4285,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; struct task_group *tg; unsigned long weight; int balanced; @@ -4343,32 +4342,15 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) - return 1; - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || - (this_load <= load && - this_load + target_load(prev_cpu, idx) <= tl_per_task)) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + if (!balanced) + return 0; - return 1; - } - return 0; + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; } /* -- cgit v0.10.2 From 65fdac08c264506ff95ee1e34ae066e308c9e6e3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:46 +0200 Subject: sched: Fix avg_load computation The computation of avg_load and avg_load_per_task should only take into account the number of CFS tasks. The non-CFS tasks are already taken into account by decreasing the CPU's capacity and they will be tracked in the CPU's utilization (group_utilization) of the next patches. Reviewed-by: Preeti U Murthy Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 391eaf2..eb87229 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4095,7 +4095,7 @@ static unsigned long capacity_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -5985,7 +5985,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) *overload = true; -- cgit v0.10.2 From 26bc3c50d3b3984564c270da86f1fbbfb774dbcd Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:47 +0200 Subject: sched: Allow all architectures to set 'capacity_orig' 'capacity_orig' is only changed for systems with an SMT sched_domain level in order to reflect the lower capacity of CPUs. Heterogenous systems also have to reflect an original capacity that is different from the default value. Create a more generic function arch_scale_cpu_capacity that can be also used by non SMT platforms to set capacity_orig. The __weak implementation of arch_scale_cpu_capacity() is the previous SMT variant, in order to keep backward compatibility with the use of capacity_orig. arch_scale_smt_capacity() and default_scale_smt_capacity() have been removed as they were not used elsewhere than in arch_scale_cpu_capacity(). Signed-off-by: Vincent Guittot Reviewed-by: Kamalesh Babulal Reviewed-by: Preeti U. Murthy [ Added default_scale_cpu_capacity() back. ] Signed-off-by: Peter Zijlstra (Intel) Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eb87229..be530e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5705,19 +5705,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) +static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; - unsigned long smt_gain = sd->smt_gain; + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; - smt_gain /= weight; - - return smt_gain; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_capacity(sd, cpu); + return default_scale_cpu_capacity(sd, cpu); } static unsigned long scale_rt_capacity(int cpu) @@ -5756,18 +5754,15 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_smt_capacity(sd, cpu); - else - capacity *= default_scale_smt_capacity(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_cpu_capacity(sd, cpu); + else + capacity *= default_scale_cpu_capacity(sd, cpu); - capacity >>= SCHED_CAPACITY_SHIFT; - } + capacity >>= SCHED_CAPACITY_SHIFT; sdg->sgc->capacity_orig = capacity; -- cgit v0.10.2 From d3bfca1a7b028a57d648dbc0985492c6a4466ccf Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:48 +0200 Subject: ARM: topology: Use the new cpu_capacity interface Use the new arch_scale_cpu_capacity() scheduler facility in order to reflect the original capacity of a CPU instead of arch_scale_freq_capacity() which is more linked to a scaling of the capacity linked to the frequency. Signed-off-by: Vincent Guittot Acked-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Grant Likely Cc: Guenter Roeck Cc: Linus Torvalds Cc: Mark Brown Cc: Nicolas Pitre Cc: Rob Herring Cc: Russell King Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1409051215-16788-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index e35d880..89cfdd6 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); } @@ -166,7 +166,7 @@ static void update_cpu_capacity(unsigned int cpu) set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", - cpu, arch_scale_freq_capacity(NULL, cpu)); + cpu, arch_scale_cpu_capacity(NULL, cpu)); } #else -- cgit v0.10.2 From bd61c98f9b3f142cd63f9e15acfe203bec9e5f5a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:50 +0200 Subject: sched: Test the CPU's capacity in wake_affine() Currently the task always wakes affine on this_cpu if the latter is idle. Before waking up the task on this_cpu, we check that this_cpu capacity is not significantly reduced because of RT tasks or irq activity. Use case where the number of irq and/or the time spent under irq is important will take benefit of this because the task that is woken up by irq or softirq will not use the same CPU than irq (and softirq) but a idle one. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be530e4..74fa2c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4284,6 +4284,7 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; + s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; struct task_group *tg; unsigned long weight; @@ -4327,21 +4328,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - if (this_load > 0) { - s64 this_eff_load, prev_eff_load; + this_eff_load = 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); + if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + } + + balanced = this_eff_load <= prev_eff_load; - balanced = this_eff_load <= prev_eff_load; - } else - balanced = true; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); if (!balanced) -- cgit v0.10.2 From 9c58c79a8a76c510cd3a5012c536d4fe3c81ec3b Mon Sep 17 00:00:00 2001 From: Zhihui Zhang Date: Sat, 20 Sep 2014 21:24:36 -0400 Subject: sched: Clean up some typos and grammatical errors in code/comments Signed-off-by: Zhihui Zhang Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1411262676-19928-1-git-send-email-zzhsuny@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 61ee2b3..a284190 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8069,7 +8069,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); - parent_quota = parent_b->hierarchal_quota; + parent_quota = parent_b->hierarchical_quota; /* * ensure max(child_quota) <= parent_quota, inherit when no @@ -8080,7 +8080,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) else if (parent_quota != RUNTIME_INF && quota > parent_quota) return -EINVAL; } - cfs_b->hierarchal_quota = quota; + cfs_b->hierarchical_quota = quota; return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 74fa2c2..2a1e6ac 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2224,8 +2224,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) /* * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) - * With a look-up table which covers k^n (navg_load >= busiest->avg_load) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aa0f73b..1bc6aad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -188,7 +188,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota, runtime; - s64 hierarchal_quota; + s64 hierarchical_quota; u64 runtime_expires; int idle, timer_active; -- cgit v0.10.2 From be34f0f3e6aed6e828a8059247d169d38da128d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:47:43 +0200 Subject: sched/numa: Kill the wrong/dead TASK_DEAD check in task_numa_fault() current->state == TASK_DEAD means that the task is doing its last schedule(), page fault is obviously impossible at this stage. Signed-off-by: Oleg Nesterov Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140921194743.GA30114@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2a1e6ac..9ee3d4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1817,10 +1817,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { int size = sizeof(*p->numa_faults_memory) * -- cgit v0.10.2 From a5e7be3b28a235108c59561bea55eea1072b23b0 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Sep 2014 10:22:39 +0100 Subject: sched/deadline: Clear dl_entity params when setscheduling to different class When a task is using SCHED_DEADLINE and the user setschedules it to a different class its sched_dl_entity static parameters are not cleaned up. This causes a bug if the user sets it back to SCHED_DEADLINE with the same parameters again. The problem resides in the check we perform at the very beginning of dl_overflow(): if (new_bw == p->dl.dl_bw) return 0; This condition is met in the case depicted above, so the function returns and dl_b->total_bw is not updated (the p->dl.dl_bw is not added to it). After this, admission control is broken. This patch fixes the thing, properly clearing static parameters for a task that ceases to use SCHED_DEADLINE. Reported-by: Daniele Alessandrelli Reported-by: Daniel Wagner Reported-by: Vincent Legout Tested-by: Luca Abeni Tested-by: Daniel Wagner Tested-by: Vincent Legout Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Fabio Checconi Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1411118561-26323-2-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a284190..09bde2a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1809,6 +1809,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) } /* + * This function clears the sched_dl_entity static params. + */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; +} + +/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. * @@ -1832,10 +1846,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - p->dl.dl_runtime = p->dl.runtime = 0; - p->dl.dl_deadline = p->dl.deadline = 0; - p->dl.dl_period = 0; - p->dl.flags = 0; + __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aaa5abb..efb9412 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1565,6 +1565,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) hrtimer_try_to_cancel(&p->dl.dl_timer); + __dl_clear_params(p); + #ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1bc6aad..76f3a38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -130,6 +130,9 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; + +void __dl_clear_params(struct task_struct *p); + /* * To keep the bandwidth of -deadline tasks and groups under control * we need some place where: -- cgit v0.10.2 From 91ec6778ec4f963fcb2c2793610919b572f633b0 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Sep 2014 10:22:41 +0100 Subject: sched/deadline: Fix inter- exclusive cpusets migrations Users can perform clustered scheduling using the cpuset facility. After an exclusive cpuset is created, task migrations happen only between CPUs belonging to the same cpuset. Inter- cpuset migrations can only happen when the user requires so, moving a task between different cpusets. This behaviour is broken in SCHED_DEADLINE, as currently spurious inter- cpuset migration may happen without user intervention. This patch fix the problem (and shuffles the code a bit to improve clarity). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: raistlin@linux.it Cc: michael@amarulasolutions.com Cc: fchecconi@gmail.com Cc: daniel.wagner@bmw-carit.de Cc: vincent@legout.info Cc: luca.abeni@unitn.it Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1411118561-26323-4-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963..539ca3c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, cp->free_cpus, - &p->cpus_allowed) && cpumask_and(later_mask, - later_mask, cpu_active_mask)) { + if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index efb9412..abfaf3d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1164,6 +1164,13 @@ static int find_later_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; + /* + * We have to consider system topology and task affinity + * first, then we can look for a suitable cpu. + */ + cpumask_copy(later_mask, task_rq(task)->rd->span); + cpumask_and(later_mask, later_mask, cpu_active_mask); + cpumask_and(later_mask, later_mask, &task->cpus_allowed); best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) -- cgit v0.10.2 From 442bf3aaf55a91ebfec71da46a4ee10a3c905bcc Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 4 Sep 2014 11:32:09 -0400 Subject: sched: Let the scheduler see CPU idle states When the cpu enters idle, it stores the cpuidle state pointer in its struct rq instance which in turn could be used to make a better decision when balancing tasks. As soon as the cpu exits its idle state, the struct rq reference is cleared. There are a couple of situations where the idle state pointer could be changed while it is being consulted: 1. For x86/acpi with dynamic c-states, when a laptop switches from battery to AC that could result on removing the deeper idle state. The acpi driver triggers: 'acpi_processor_cst_has_changed' 'cpuidle_pause_and_lock' 'cpuidle_uninstall_idle_handler' 'kick_all_cpus_sync'. All cpus will exit their idle state and the pointed object will be set to NULL. 2. The cpuidle driver is unloaded. Logically that could happen but not in practice because the drivers are always compiled in and 95% of them are not coded to unregister themselves. In any case, the unloading code must call 'cpuidle_unregister_device', that calls 'cpuidle_pause_and_lock' leading to 'kick_all_cpus_sync' as mentioned above. A race can happen if we use the pointer and then one of these two scenarios occurs at the same moment. In order to be safe, the idle state pointer stored in the rq must be used inside a rcu_read_lock section where we are protected with the 'rcu_barrier' in the 'cpuidle_uninstall_idle_handler' function. The idle_get_state() and idle_put_state() accessors should be used to that effect. Signed-off-by: Daniel Lezcano Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Cc: "Rafael J. Wysocki" Cc: linux-pm@vger.kernel.org Cc: linaro-kernel@lists.linaro.org Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index d31e04c..125150d 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -225,6 +225,12 @@ void cpuidle_uninstall_idle_handler(void) initialized = 0; wake_up_all_idle_cpus(); } + + /* + * Make sure external observers (such as the scheduler) + * are done looking at pointed idle states. + */ + synchronize_rcu(); } /** diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc4..c47fce7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -147,6 +147,9 @@ use_default: clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; + /* Take note of the planned idle state. */ + idle_set_state(this_rq(), &drv->states[next_state]); + /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take @@ -154,6 +157,9 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); + /* The cpu is no longer idle or about to enter idle. */ + idle_set_state(this_rq(), NULL); + if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 76f3a38..16e1ca9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -14,6 +14,7 @@ #include "cpuacct.h" struct rq; +struct cpuidle_state; /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -643,6 +644,11 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif }; static inline int cpu_of(struct rq *rq) @@ -1196,6 +1202,30 @@ static inline void idle_exit_fair(struct rq *rq) { } #endif +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -- cgit v0.10.2 From 83a0a96a5f26d974580fd7251043ff70c8f1823d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 4 Sep 2014 11:32:10 -0400 Subject: sched/fair: Leverage the idle state info when choosing the "idlest" cpu The code in find_idlest_cpu() looks for the CPU with the smallest load. However, if multiple CPUs are idle, the first idle CPU is selected irrespective of the depth of its idle state. Among the idle CPUs we should pick the one with with the shallowest idle state, or the latest to have gone idle if all idle CPUs are in the same state. The later applies even when cpuidle is configured out. This patch doesn't cover the following issues: - The idle exit latency of a CPU might be larger than the time needed to migrate the waking task to an already running CPU with sufficient capacity, and therefore performance would benefit from task packing in such case (in most cases task packing is about power saving). - Some idle states have a non negligible and non abortable entry latency which needs to run to completion before the exit latency can start. A concurrent patch series is making this info available to the cpuidle core. Once available, the entry latency with the idle timestamp could determine when the exit latency may be effective. Those issues will be handled in due course. In the mean time, what is implemented here should improve things already compared to the current state of affairs. Based on an initial patch from Daniel Lezcano. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: "Rafael J. Wysocki" Cc: Linus Torvalds Cc: linux-pm@vger.kernel.org Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/n/tip-@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9ee3d4f..8cb32f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -4415,20 +4416,46 @@ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; - int idlest = -1; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int least_loaded_cpu = this_cpu; + int shallowest_idle_cpu = -1; int i; /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; + if (idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { + /* + * We give priority to a CPU whose idle state + * has the smallest exit latency irrespective + * of any idle timestamp. + */ + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + /* + * If equal or no active idle state, then + * the most recently idled CPU might have + * a warmer cache. + */ + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else { + load = weighted_cpuload(i); + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + least_loaded_cpu = i; + } } } - return idlest; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } /* -- cgit v0.10.2 From 8651c65844e93af44554272b7e0d2b142837b244 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:33:36 +0200 Subject: sched: Fix the task-group check in tg_has_rt_tasks() tg_has_rt_tasks() wants to find an RT task in this task_group, but task_rq(p)->rt.tg wrongly checks the root rt_rq. Signed-off-by: Oleg Nesterov Reviewed-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20140921193336.GA28618@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 09bde2a..0abfb7e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7441,7 +7441,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) struct task_struct *g, *p; for_each_process_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) + if (rt_task(p) && task_group(p) == tg) return 1; } -- cgit v0.10.2 From 3472eaa1f12e217e2b8b0ef658ff861b2308cbbd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:33:38 +0200 Subject: sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock() 1. read_lock(tasklist_lock) does not need to disable irqs. 2. ->mm != NULL is a common mistake, use PF_KTHREAD. 3. The second ->mm check can be simply removed. 4. task_rq_lock() looks better than raw_spin_lock(&p->pi_lock) + __task_rq_lock(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140921193338.GA28621@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0abfb7e..d65566d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7220,12 +7220,12 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irqsave(&tasklist_lock, flags); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* * Only normalize user tasks: */ - if (!p->mm) + if (p->flags & PF_KTHREAD) continue; p->se.exec_start = 0; @@ -7240,20 +7240,16 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (task_nice(p) < 0 && p->mm) + if (task_nice(p) < 0) set_user_nice(p, 0); continue; } - raw_spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - + rq = task_rq_lock(p, &flags); normalize_task(rq, p); - - __task_rq_unlock(rq); - raw_spin_unlock(&p->pi_lock); + task_rq_unlock(rq, p, &flags); } - read_unlock_irqrestore(&tasklist_lock, flags); + read_unlock(&tasklist_lock); } #endif /* CONFIG_MAGIC_SYSRQ */ -- cgit v0.10.2 From 5bd96ab6fef66ec6b9f54134364e618fd0f8f2f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:33:41 +0200 Subject: sched: print_rq(): Don't use tasklist_lock read_lock_irqsave(tasklist_lock) in print_rq() looks strange. We do not need to disable irqs, and they are already disabled by the caller. And afaics this lock buys nothing, we can rely on rcu_read_lock(). In this case it makes sense to also move rcu_read_lock/unlock from the caller to print_rq(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140921193341.GA28628@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c7fe1ea0..ce33780 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - unsigned long flags; SEQ_printf(m, "\nrunnable tasks:\n" @@ -159,14 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "------------------------------------------------------" "----------------------------------------------------\n"); - read_lock_irqsave(&tasklist_lock, flags); + rcu_read_lock(); for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); } - read_unlock_irqrestore(&tasklist_lock, flags); + rcu_read_unlock(); } void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) @@ -331,9 +330,7 @@ do { \ print_cfs_stats(m, cpu); print_rt_stats(m, cpu); - rcu_read_lock(); print_rq(m, rq, cpu); - rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } -- cgit v0.10.2 From c55f5158f5606f8a62e694b7e009f59b92ac6258 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2014 17:06:41 +0200 Subject: sched, mips, ia64: Remove __ARCH_WANT_UNLOCKED_CTXSW Kirill found that there's a subtle race in the __ARCH_WANT_UNLOCKED_CTXSW code, and instead of fixing it, remove the entire exception because neither arch that uses it seems to actually still require it. Boot tested on mips64el (qemu) only. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kirill Tkhai Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Fenghua Yu Cc: James Hogan Cc: Kees Cook Cc: Linus Torvalds Cc: Paul Burton Cc: Qais Yousef Cc: Ralf Baechle Cc: Tony Luck Cc: oleg@redhat.com Cc: linux@roeck-us.net Cc: linux-ia64@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20140923150641.GH3312@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index c736713..ce53c50 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -19,7 +19,6 @@ #include #include -#define __ARCH_WANT_UNLOCKED_CTXSW #define ARCH_HAS_PREFETCH_SWITCH_STACK #define IA64_NUM_PHYS_STACK_REG 96 diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 05f0843..f1df4cb 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -397,12 +397,6 @@ unsigned long get_wchan(struct task_struct *p); #define ARCH_HAS_PREFETCHW #define prefetchw(x) __builtin_prefetch((x), 1, 1) -/* - * See Documentation/scheduler/sched-arch.txt; prevents deadlock on SMP - * systems. - */ -#define __ARCH_WANT_UNLOCKED_CTXSW - #endif #endif /* _ASM_PROCESSOR_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d65566d..5b0eac9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2331,10 +2331,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ post_schedule(rq); -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ - preempt_enable(); -#endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } @@ -2377,9 +2373,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 16e1ca9..6130251 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -975,7 +975,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -1013,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_unlock_irq(&rq->lock); } -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif - raw_spin_unlock(&rq->lock); -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif - local_irq_enable(); -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - /* * wake flags */ -- cgit v0.10.2 From 7a96c231ca23f0f5622852307df4209afc502ec3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:12 +0400 Subject: sched/fair: Remove duplicate code from can_migrate_task() Combine two branches which do the same. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183612.11015.64200.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cb32f8..10a5a28 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5315,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS + if (migrate_improves_locality(p, env) || !tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif - return 1; - } - - if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } - return 1; } -- cgit v0.10.2 From 66339c31bc3978d5fff9c4b4cb590a861def4db2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:24 +0400 Subject: sched: Use dl_bw_of() under RCU read lock dl_bw_of() dereferences rq->rd which has to have RCU read lock held. Probability of use-after-free isn't zero here. Also add lockdep assert into dl_bw_cpus(). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: # v3.14+ Cc: Paul E. McKenney Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183624.11015.71558.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b0eac9..f0adb03 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2021,6 +2021,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2029,6 +2031,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -7645,6 +7649,8 @@ static int sched_dl_global_constraints(void) int cpu, ret = 0; unsigned long flags; + rcu_read_lock(); + /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in @@ -7666,6 +7672,8 @@ static int sched_dl_global_constraints(void) break; } + rcu_read_unlock(); + return ret; } @@ -7681,6 +7689,7 @@ static void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + rcu_read_lock(); /* * FIXME: As above... */ @@ -7691,6 +7700,7 @@ static void sched_dl_do_global(void) dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); } + rcu_read_unlock(); } static int sched_rt_global_validate(void) -- cgit v0.10.2 From 16303ab2fe214635240a8f57cad2cd29792d4e3b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:30 +0400 Subject: sched: cleanup: Rename 'out_unlock' to 'out_free_new_mask' Nothing is locked there, so label's name only confuses a reader. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20140922183630.11015.59500.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0adb03..316127a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4029,14 +4029,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_unlock; + goto out_free_new_mask; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_unlock; + goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); @@ -4054,7 +4054,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { retval = -EBUSY; - goto out_unlock; + goto out_free_new_mask; } } #endif @@ -4073,7 +4073,7 @@ again: goto again; } } -out_unlock: +out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); -- cgit v0.10.2 From f1e3a0932f3a9554371792a7daaf1e0eb19f66d5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:36 +0400 Subject: sched: Use rq->rd in sched_setaffinity() under RCU read lock Probability of use-after-free isn't zero in this place. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: # v3.14+ Cc: Paul E. McKenney Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183636.11015.83611.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 316127a..b5349fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4049,13 +4049,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) * root_domain. */ #ifdef CONFIG_SMP - if (task_has_dl_policy(p)) { - const struct cpumask *span = task_rq(p)->rd->span; - - if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { retval = -EBUSY; + rcu_read_unlock(); goto out_free_new_mask; } + rcu_read_unlock(); } #endif again: -- cgit v0.10.2 From 8aa6f0ebf41b5fdd186276394bf07e7bd6884d94 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:43 +0400 Subject: sched/rt: Use resched_curr() in task_tick_rt() Some time ago PREEMPT_NEED_RESCHED was implemented, so reschedule technics is a little more difficult now. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183642.11015.66039.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2e6a774..87ea5bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2072,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + resched_curr(rq); return; } } -- cgit v0.10.2 From cebf15eb09a2fd2fa73ee4faa9c4d2f813cf0f09 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 18 Sep 2014 12:33:34 -0700 Subject: x86, sched: Add new topology for multi-NUMA-node CPUs I'm getting the spew below when booting with Haswell (Xeon E5-2699 v3) CPUs and the "Cluster-on-Die" (CoD) feature enabled in the BIOS. It seems similar to the issue that some folks from AMD ran in to on their systems and addressed in this commit: 161270fc1f9d ("x86/smp: Fix topology checks on AMD MCM CPUs") Both these Intel and AMD systems break an assumption which is being enforced by topology_sane(): a socket may not contain more than one NUMA node. AMD special-cased their system by looking for a cpuid flag. The Intel mode is dependent on BIOS options and I do not know of a way which it is enumerated other than the tables being parsed during the CPU bringup process. In other words, we have to trust the ACPI tables . This detects the situation where a NUMA node occurs at a place in the middle of the "CPU" sched domains. It replaces the default topology with one that relies on the NUMA information from the firmware (SRAT table) for all levels of sched domains above the hyperthreads. This also fixes a sysfs bug. We used to freak out when we saw the "mc" group cross a node boundary, so we stopped building the MC group. MC gets exported as the 'core_siblings_list' in /sys/devices/system/cpu/cpu*/topology/ and this caused CPUs with the same 'physical_package_id' to not be listed together in 'core_siblings_list'. This violates a statement from Documentation/ABI/testing/sysfs-devices-system-cpu: core_siblings: internal kernel map of cpu#'s hardware threads within the same physical_package_id. core_siblings_list: human-readable list of the logical CPU numbers within the same physical_package_id as cpu#. The sysfs effects here cause an issue with the hwloc tool where it gets confused and thinks there are more sockets than are physically present. Before this patch, there are two packages: # cd /sys/devices/system/cpu/ # cat cpu*/topology/physical_package_id | sort | uniq -c 18 0 18 1 But 4 _sets_ of core siblings: # cat cpu*/topology/core_siblings_list | sort | uniq -c 9 0-8 9 18-26 9 27-35 9 9-17 After this set, there are only 2 sets of core siblings, which is what we expect for a 2-socket system. # cat cpu*/topology/physical_package_id | sort | uniq -c 18 0 18 1 # cat cpu*/topology/core_siblings_list | sort | uniq -c 18 0-17 18 18-35 Example spew: ... NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. #2 #3 #4 #5 #6 #7 #8 .... node #1, CPUs: #9 ------------[ cut here ]------------ WARNING: CPU: 9 PID: 0 at /home/ak/hle/linux-hle-2.6/arch/x86/kernel/smpboot.c:306 topology_sane.isra.2+0x74/0x90() sched: CPU #9's mc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency. Modules linked in: CPU: 9 PID: 0 Comm: swapper/9 Not tainted 3.17.0-rc1-00293-g8e01c4d-dirty #631 Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS GRNDSDP1.86B.0036.R05.1407140519 07/14/2014 0000000000000009 ffff88046ddabe00 ffffffff8172e485 ffff88046ddabe48 ffff88046ddabe38 ffffffff8109691d 000000000000b001 0000000000000009 ffff88086fc12580 000000000000b020 0000000000000009 ffff88046ddabe98 Call Trace: [] dump_stack+0x45/0x56 [] warn_slowpath_common+0x7d/0xa0 [] warn_slowpath_fmt+0x4c/0x50 [] topology_sane.isra.2+0x74/0x90 [] set_cpu_sibling_map+0x31e/0x4f0 [] start_secondary+0x1ad/0x240 ---[ end trace 3fe5f587a9fcde61 ]--- #10 #11 #12 #13 #14 #15 #16 #17 .... node #2, CPUs: #18 #19 #20 #21 #22 #23 #24 #25 #26 .... node #3, CPUs: #27 #28 #29 #30 #31 #32 #33 #34 #35 Signed-off-by: Dave Hansen [ Added LLC domain and s/match_mc/match_die/ ] Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: David Rientjes Cc: Igor Mammedov Cc: Linus Torvalds Cc: Prarit Bhargava Cc: Toshi Kani Cc: brice.goglin@gmail.com Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20140918193334.C065EBCE@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2d872e0..8de8eb7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -296,11 +296,19 @@ void smp_store_cpu_info(int id) } static bool +topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return (cpu_to_node(cpu1) == cpu_to_node(cpu2)); +} + +static bool topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + return !WARN_ONCE(!topology_same_node(c, o), "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " "[node: %d != %d]. Ignoring dependency.\n", cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); @@ -341,17 +349,44 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +/* + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) { - if (cpu_has(c, X86_FEATURE_AMD_DCM)) - return true; - - return topology_sane(c, o, "mc"); - } + if (c->phys_proc_id == o->phys_proc_id) + return true; return false; } +static struct sched_domain_topology_level numa_inside_package_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { NULL, }, +}; +/* + * set_sched_topology() sets the topology internal to a CPU. The + * NUMA topologies are layered on top of it to build the full + * system topology. + * + * If NUMA nodes are observed to occur within a CPU package, this + * function should be called. It forces the sched domain code to + * only use the SMT level for the CPU portion of the topology. + * This essentially falls back to relying on NUMA information + * from the SRAT table to describe the entire system topology + * (except for hyperthreads). + */ +static void primarily_use_numa_for_topology(void) +{ + set_sched_topology(numa_inside_package_topology); +} + void set_cpu_sibling_map(int cpu) { bool has_smt = smp_num_siblings > 1; @@ -388,7 +423,7 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); - if ((i == cpu) || (has_mp && match_mc(c, o))) { + if ((i == cpu) || (has_mp && match_die(c, o))) { link_mask(core, cpu, i); /* @@ -410,6 +445,8 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } + if (match_die(c, o) == !topology_same_node(c, o)) + primarily_use_numa_for_topology(); } } -- cgit v0.10.2 From 728e5653e6fdb2a0892e94a600aef8c9a036c7eb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 30 Sep 2014 14:45:46 -0700 Subject: sched/x86: Fix up typo in topology detection Commit: cebf15eb09a2 ("x86, sched: Add new topology for multi-NUMA-node CPUs") some code to try to detect the situation where we have a NUMA node inside of the "DIE" sched domain. It detected this by looking for cpus which match_die() but do not match NUMA nodes via topology_same_node(). I wrote it up as: if (match_die(c, o) == !topology_same_node(c, o)) which actually seemed to work some of the time, albiet accidentally. It should have been doing an &&, not an ==. This code essentially chopped off the "DIE" domain on one of Andrew Morton's systems. He reported that this patch fixed his issue. Signed-off-by: Dave Hansen Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra (Intel) Cc: Dave Hansen Cc: Andrew Morton Cc: David Rientjes Cc: Igor Mammedov Cc: Jan Kiszka Cc: Lan Tianyu Cc: Linus Torvalds Cc: Prarit Bhargava Cc: Toshi Kani Link: http://lkml.kernel.org/r/20140930214546.FD481CFF@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8de8eb7..bae9e09 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -445,7 +445,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_die(c, o) == !topology_same_node(c, o)) + if (match_die(c, o) && !topology_same_node(c, o)) primarily_use_numa_for_topology(); } } -- cgit v0.10.2 From 43f4d66637bc752e93a77ff2536474a5a3888442 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 1 Oct 2014 15:38:55 +0200 Subject: sched: Improve sysbench performance by fixing spurious active migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit caeb178c60f4 ("sched/fair: Make update_sd_pick_busiest() ...") sd_pick_busiest returns a group that can be neither imbalanced nor overloaded but is only more loaded than others. This change has been introduced to ensure a better load balance in system that are not overloaded but as a side effect, it can also generate useless active migration between groups. Let take the example of 3 tasks on a quad cores system. We will always have an idle core so the load balance will find a busiest group (core) whenever an ILB is triggered and it will force an active migration (once above nr_balance_failed threshold) so the idle core becomes busy but another core will become idle. With the next ILB, the freshly idle core will try to pull the task of a busy CPU. The number of spurious active migration is not so huge in quad core system because the ILB is not triggered so much. But it becomes significant as soon as you have more than one sched_domain level like on a dual cluster of quad cores where the ILB is triggered every tick when you have more than 1 busy_cpu We need to ensure that the migration generate a real improveùent and will not only move the avg_load imbalance on another CPU. Before caeb178c60f4f93f1b45c0bc056b5cf6d217b67f, the filtering of such use case was ensured by the following test in f_b_g: if ((local->idle_cpus < busiest->idle_cpus) && busiest->sum_nr_running <= busiest->group_weight) This patch modified the condition to take into account situation where busiest group is not overloaded: If the diff between the number of idle cpus in 2 groups is less than or equal to 1 and the busiest group is not overloaded, moving a task will not improve the load balance but just move it. A test with sysbench on a dual clusters of quad cores gives the following results: command: sysbench --test=cpu --num-threads=5 --max-time=5 run The HZ is 200 which means that 1000 ticks has fired during the test. With Mainline, perf gives the following figures: Samples: 727 of event 'sched:sched_migrate_task' Event count (approx.): 727 Overhead Command Shared Object Symbol ........ ............... ............. .............. 12.52% migration/1 [unknown] [.] 00000000 12.52% migration/5 [unknown] [.] 00000000 12.52% migration/7 [unknown] [.] 00000000 12.10% migration/6 [unknown] [.] 00000000 11.83% migration/0 [unknown] [.] 00000000 11.83% migration/3 [unknown] [.] 00000000 11.14% migration/4 [unknown] [.] 00000000 10.87% migration/2 [unknown] [.] 00000000 2.75% sysbench [unknown] [.] 00000000 0.83% swapper [unknown] [.] 00000000 0.55% ktps65090charge [unknown] [.] 00000000 0.41% mmcqd/1 [unknown] [.] 00000000 0.14% perf [unknown] [.] 00000000 With this patch, perf gives the following figures Samples: 20 of event 'sched:sched_migrate_task' Event count (approx.): 20 Overhead Command Shared Object Symbol ........ ............... ............. .............. 80.00% sysbench [unknown] [.] 00000000 10.00% swapper [unknown] [.] 00000000 5.00% ktps65090charge [unknown] [.] 00000000 5.00% migration/1 [unknown] [.] 00000000 Signed-off-by: Vincent Guittot Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1412170735-5356-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10a5a28..dfdcbfd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6436,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group load doesn't - * have more tasks than the number of available cpu's and - * there is no imbalance between this and busiest group - * wrt to idle cpu's, it is balanced. + * This cpu is idle. If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt idle cpus, it is balanced. The imbalance becomes + * significant if the diff is greater than 1 otherwise we + * might end up to just move the imbalance on another group */ - if ((local->idle_cpus < busiest->idle_cpus) && - busiest->sum_nr_running <= busiest->group_weight) + if ((busiest->group_type != group_overloaded) && + (local->idle_cpus <= (busiest->idle_cpus + 1))) goto out_balanced; } else { /* -- cgit v0.10.2 From 347abad981c1ef815ea5ba861adba6a8c6aa1580 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 30 Sep 2014 15:59:47 -0400 Subject: sched, time: Fix build error with 64 bit cputime_t on 32 bit systems On 32 bit systems cmpxchg cannot handle 64 bit values, so some additional magic is required to allow a 32 bit system with CONFIG_VIRT_CPU_ACCOUNTING_GEN=y enabled to build. Make sure the correct cmpxchg function is used when doing an atomic swap of a cputime_t. Reported-by: Arnd Bergmann Signed-off-by: Rik van Riel Acked-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Cc: oleg@redhat.com Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: linux390@de.ibm.com Cc: linux-arch@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/20140930155947.070cdb1f@annuminas.surriel.com Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 607559a..6c840ce 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { } typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #ifdef __KERNEL__ /* diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index f65bd36..3001887 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -18,6 +18,8 @@ typedef unsigned long long __nocast cputime_t; typedef unsigned long long __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + static inline unsigned long __div(unsigned long long n, unsigned long base) { #ifndef CONFIG_64BIT diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index d5cb78f5..fe386fc 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -3,6 +3,8 @@ typedef unsigned long __nocast cputime_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) #define cputime_to_scaled(__ct) (__ct) diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 4e81760..0419485 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -21,6 +21,8 @@ typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 64492df..8394b1e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -555,6 +555,23 @@ drop_precision: } /* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. + */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + +/* * Adjust tick based cputime random precision against scheduler * runtime accounting. */ @@ -599,16 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - * Atomic exchange protects against concurrent cputime_adjust(). - */ - while (stime > (rtime = ACCESS_ONCE(prev->stime))) - cmpxchg(&prev->stime, rtime, stime); - while (utime > (rtime = ACCESS_ONCE(prev->utime))) - cmpxchg(&prev->utime, rtime, utime); + cputime_advance(&prev->stime, stime); + cputime_advance(&prev->utime, utime); out: *ut = prev->utime; -- cgit v0.10.2 From 10a12983b3d437a6998b3845870e52c1c752c101 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 1 Oct 2014 01:04:44 +0400 Subject: sched/fair: Delete resched_cpu() from idle_balance() We already reschedule env.dst_cpu in attach_tasks()->check_preempt_curr() if this is necessary. Furthermore, a higher priority class task may be current on dest rq, we shouldn't disturb it. Signed-off-by: Kirill Tkhai Cc: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20140930210441.5258.55054.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfdcbfd..bd61cff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6701,12 +6701,6 @@ more_balance: local_irq_restore(flags); - /* - * some other cpu did the load balance for us. - */ - if (cur_ld_moved && env.dst_cpu != smp_processor_id()) - resched_cpu(env.dst_cpu); - if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; goto more_balance; -- cgit v0.10.2 From f10e00f4bf360c36edbe6bf18a6c75b171cbe012 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 30 Sep 2014 12:23:37 +0400 Subject: sched/dl: Use dl_bw_of() under rcu_read_lock_sched() rq->rd is freed using call_rcu_sched(), so rcu_read_lock() to access it is not enough. We should use either rcu_read_lock_sched() or preempt_disable(). Reported-by: Sasha Levin Suggested-by: Peter Zijlstra Signed-off-by: Kirill Tkhai Fixes: 66339c31bc39 "sched: Use dl_bw_of() under RCU read lock" Link: http://lkml.kernel.org/r/1412065417.20287.24.camel@tkhai Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b5349fe..c84bdc0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5264,6 +5264,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, { unsigned long flags; long cpu = (long)hcpu; + struct dl_bw *dl_b; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: @@ -5271,15 +5272,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, /* explicitly allow suspend */ if (!(action & CPU_TASKS_FROZEN)) { - struct dl_bw *dl_b = dl_bw_of(cpu); bool overflow; int cpus; + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(cpu); overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (overflow) return notifier_from_errno(-EBUSY); } @@ -7647,11 +7652,10 @@ static int sched_dl_global_constraints(void) u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; int cpu, ret = 0; unsigned long flags; - rcu_read_lock(); - /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in @@ -7662,25 +7666,27 @@ static int sched_dl_global_constraints(void) * solutions is welcome! */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (ret) break; } - rcu_read_unlock(); - return ret; } static void sched_dl_do_global(void) { u64 new_bw = -1; + struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -7690,18 +7696,19 @@ static void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - rcu_read_lock(); /* * FIXME: As above... */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); } - rcu_read_unlock(); } static int sched_rt_global_validate(void) -- cgit v0.10.2