summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c66
-rw-r--r--kernel/sched/fair.c53
-rw-r--r--kernel/sched/loadavg.c4
-rw-r--r--kernel/sched/sched.h23
4 files changed, 50 insertions, 96 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 692c948..78181c0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1141,6 +1141,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int ret = 0;
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
/*
@@ -6102,6 +6103,9 @@ enum s_alloc {
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
*
+ * Only CPUs that can arrive at this group should be considered to continue
+ * balancing.
+ *
* Asymmetric node setups can result in situations where the domain tree is of
* unequal depth, make sure to skip domains that already cover the entire
* range.
@@ -6113,18 +6117,31 @@ enum s_alloc {
*/
static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ const struct cpumask *sg_span = sched_group_cpus(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
+ continue;
+
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
continue;
cpumask_set_cpu(i, sched_group_mask(sg));
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
}
/*
@@ -6148,7 +6165,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct cpumask *sg_span;
if (cpumask_test_cpu(i, covered))
@@ -7276,16 +7293,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus(true);
}
@@ -7422,22 +7438,6 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
-#ifdef CONFIG_SCHED_SMT
-DEFINE_STATIC_KEY_FALSE(sched_smt_present);
-
-static void sched_init_smt(void)
-{
- /*
- * We've enumerated all CPUs and will assume that if any CPU
- * has SMT siblings, CPU0 will too.
- */
- if (cpumask_weight(cpu_smt_mask(0)) > 1)
- static_branch_enable(&sched_smt_present);
-}
-#else
-static inline void sched_init_smt(void) { }
-#endif
-
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@@ -7467,9 +7467,6 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
-
- sched_init_smt();
-
sched_smp_initialized = true;
}
@@ -7964,6 +7961,7 @@ void sched_move_task(struct task_struct *tsk)
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ update_rq_clock(rq);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
@@ -8379,11 +8377,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -8786,6 +8793,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c242944..7a68c63 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5310,43 +5310,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
- int next;
-
-again:
- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
- if (*wrapped) {
- if (next >= start)
- return nr_cpumask_bits;
- } else {
- if (next >= nr_cpumask_bits) {
- *wrapped = 1;
- n = -1;
- goto again;
- }
- }
-
- return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
- for ((wrap) = 0, (cpu) = (start)-1; \
- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
- (cpu) < nr_cpumask_bits; )
-
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5376,7 +5339,7 @@ static inline bool test_idle_cores(int cpu, bool def)
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
*/
-void __update_idle_core(struct rq *rq)
+void update_idle_core(struct rq *rq)
{
int core = cpu_of(rq);
int cpu;
@@ -5406,17 +5369,14 @@ unlock:
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
+ int core, cpu;
if (!test_idle_cores(target, false))
return -1;
cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
- for_each_cpu_wrap(core, cpus, target, wrap) {
+ for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5444,9 +5404,6 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
{
int cpu;
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
@@ -5482,7 +5439,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle = this_rq()->avg_idle;
u64 time, cost;
s64 delta;
- int cpu, wrap;
+ int cpu;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5499,7 +5456,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
if (idle_cpu(cpu))
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a2d6eb7..ec91fcc 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -201,8 +201,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -211,7 +212,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935..ad77d66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -43,6 +43,12 @@ extern void cpu_load_update_active(struct rq *this_rq);
static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif
+#ifdef CONFIG_SCHED_SMT
+extern void update_idle_core(struct rq *rq);
+#else
+static inline void update_idle_core(struct rq *rq) { }
+#endif
+
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -731,23 +737,6 @@ static inline int cpu_of(struct rq *rq)
#endif
}
-
-#ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
-extern void __update_idle_core(struct rq *rq);
-
-static inline void update_idle_core(struct rq *rq)
-{
- if (static_branch_unlikely(&sched_smt_present))
- __update_idle_core(rq);
-}
-
-#else
-static inline void update_idle_core(struct rq *rq) { }
-#endif
-
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))