 kernel/sched/core.c  |   4
 kernel/sched/fair.c  | 253
 kernel/sched/sched.h |  13
 3 files changed, 199 insertions(+), 71 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 32a2b29..1fe59da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5236,6 +5236,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
static void update_top_cache_domain(int cpu)
{
@@ -5252,6 +5253,9 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b1e5061..1422765 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
static inline void __update_task_entity_contrib(struct sched_entity *se);
/* Give new task start runnable values to heavy its load in infant time */
@@ -906,12 +908,40 @@ static unsigned long target_load(int cpu, int type);
static unsigned long power_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+/* Cached statistics for all CPUs within a node */
struct numa_stats {
+ unsigned long nr_running;
unsigned long load;
- s64 eff_load;
- unsigned long faults;
+
+ /* Total compute capacity of CPUs on a node */
+ unsigned long power;
+
+ /* Approximate capacity in terms of runnable tasks on a node */
+ unsigned long capacity;
+ int has_capacity;
};
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+ int cpu;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->nr_running += rq->nr_running;
+ ns->load += weighted_cpuload(cpu);
+ ns->power += power_of(cpu);
+ }
+
+ ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+ ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+ ns->has_capacity = (ns->nr_running < ns->capacity);
+}
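For reference, a standalone sketch of the arithmetic update_numa_stats() performs: the summed per-CPU load is scaled by the node's total power, and capacity is a rounded count of SCHED_POWER_SCALE units. The per-CPU numbers are invented, and SCHED_POWER_SCALE / DIV_ROUND_CLOSEST are redefined locally so the sketch compiles on its own.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, divisor) (((x) + (divisor) / 2) / (divisor))

int main(void)
{
	/* hypothetical 4-CPU node: weighted per-CPU loads and per-CPU power */
	unsigned long cpu_load[]  = { 512, 1024, 256, 0 };
	unsigned long cpu_power[] = { 1024, 1024, 1024, 1024 };
	unsigned long load = 0, power = 0, nr_running = 3;
	unsigned long capacity;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		load  += cpu_load[cpu];
		power += cpu_power[cpu];
	}

	/* scale the summed load by the node's total compute capacity */
	load = load * SCHED_POWER_SCALE / power;
	/* capacity: roughly how many "full power" CPUs the node amounts to */
	capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	/* prints: load=448 capacity=4 has_capacity=1 */
	printf("load=%lu capacity=%lu has_capacity=%d\n",
	       load, capacity, nr_running < capacity);
	return 0;
}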
+
struct task_numa_env {
struct task_struct *p;
@@ -920,95 +950,178 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
- unsigned long best_load;
+ int imbalance_pct, idx;
+
+ struct task_struct *best_task;
+ long best_imp;
int best_cpu;
};
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
+{
+ if (env->best_task)
+ put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
+
+ env->best_task = p;
+ env->best_imp = imp;
+ env->best_cpu = env->dst_cpu;
+}
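A minimal standalone model of the pin-while-cached pattern task_numa_assign() uses: the previously remembered candidate is released and a reference is taken on the new one, so the pointer stays valid after the RCU read section in task_numa_compare() ends. The struct, helpers and usage counts below are invented stand-ins for task_struct and get_task_struct()/put_task_struct().

#include <stdio.h>
#include <stddef.h>

struct task {
	const char *comm;
	int usage;			/* stand-in for task_struct::usage */
};

static void get_task(struct task *t) { t->usage++; }
static void put_task(struct task *t) { t->usage--; }

struct env {
	struct task *best_task;
};

static void assign_best(struct env *env, struct task *p)
{
	if (env->best_task)
		put_task(env->best_task);
	if (p)
		get_task(p);
	env->best_task = p;
}

int main(void)
{
	struct task a = { "a", 1 }, b = { "b", 1 };
	struct env env = { NULL };

	assign_best(&env, &a);	/* a.usage: 1 -> 2 */
	assign_best(&env, &b);	/* a.usage back to 1, b.usage -> 2 */

	/* prints: best=b a=1 b=2 */
	printf("best=%s a=%d b=%d\n", env.best_task->comm, a.usage, b.usage);
	return 0;
}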
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source task was migrated to the target dst_cpu, taking
+ * into account that it might be best if the task running on the dst_cpu
+ * were exchanged with the source task.
+ */
+static void task_numa_compare(struct task_numa_env *env, long imp)
+{
+ struct rq *src_rq = cpu_rq(env->src_cpu);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ struct task_struct *cur;
+ long dst_load, src_load;
+ long load;
+
+ rcu_read_lock();
+ cur = ACCESS_ONCE(dst_rq->curr);
+ if (cur->pid == 0) /* idle */
+ cur = NULL;
+
+ /*
+ * "imp" is the fault differential for the source task between the
+ * source and destination node. Calculate the total differential for
+ * the source task and potential destination task. The more negative
+ * the value is, the more remote accesses would be expected to be
+ * incurred if the tasks were swapped.
+ */
+ if (cur) {
+ /* Skip this swap candidate if it cannot be moved to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ goto unlock;
+
+ imp += task_faults(cur, env->src_nid) -
+ task_faults(cur, env->dst_nid);
+ }
+
+ if (imp < env->best_imp)
+ goto unlock;
+
+ if (!cur) {
+ /* Is there capacity at our destination? */
+ if (env->src_stats.has_capacity &&
+ !env->dst_stats.has_capacity)
+ goto unlock;
+
+ goto balance;
+ }
+
+ /* Balance doesn't matter much if we're running a task per cpu */
+ if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ goto assign;
+
+ /*
+ * In the overloaded case, try and keep the load balanced.
+ */
+balance:
+ dst_load = env->dst_stats.load;
+ src_load = env->src_stats.load;
+
+ /* XXX missing power terms */
+ load = task_h_load(env->p);
+ dst_load += load;
+ src_load -= load;
+
+ if (cur) {
+ load = task_h_load(cur);
+ dst_load -= load;
+ src_load += load;
+ }
+
+ /* make src_load the smaller */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ if (src_load * env->imbalance_pct < dst_load * 100)
+ goto unlock;
+
+assign:
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
+}
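A worked, standalone example of the balance test above (src_load * imbalance_pct < dst_load * 100). The load figures are made up; 112 is the value the patch uses as the default imbalance_pct.

#include <stdio.h>
#include <stdbool.h>

/* rejects the move/swap when the destination would end up more than
 * imbalance_pct percent busier than the source */
static bool too_imbalanced(long src_load, long dst_load, int imbalance_pct)
{
	long tmp;

	/* make src_load the smaller, as task_numa_compare() does */
	if (dst_load < src_load) {
		tmp = src_load;
		src_load = dst_load;
		dst_load = tmp;
	}
	return src_load * imbalance_pct < dst_load * 100;
}

int main(void)
{
	printf("%d\n", too_imbalanced(1000, 1100, 112));	/* 0: tolerated */
	printf("%d\n", too_imbalanced(1000, 1200, 112));	/* 1: rejected  */
	return 0;
}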
+
static int task_numa_migrate(struct task_struct *p)
{
- int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
struct task_numa_env env = {
.p = p,
+
.src_cpu = task_cpu(p),
.src_nid = cpu_to_node(task_cpu(p)),
- .dst_cpu = node_cpu,
- .dst_nid = p->numa_preferred_nid,
- .best_load = ULONG_MAX,
- .best_cpu = task_cpu(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1
};
struct sched_domain *sd;
- int cpu;
- struct task_group *tg = task_group(p);
- unsigned long weight;
- bool balanced;
- int imbalance_pct, idx = -1;
+ unsigned long faults;
+ int nid, cpu, ret;
/*
- * Find the lowest common scheduling domain covering the nodes of both
- * the CPU the task is currently running on and the target NUMA node.
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * And we want to avoid any moving of tasks about, as that would create
+ * random movement of tasks -- countering the numa conditions we're trying
+ * to satisfy here.
*/
rcu_read_lock();
- for_each_domain(env.src_cpu, sd) {
- if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
- /*
- * busy_idx is used for the load decision as it is the
- * same index used by the regular load balancer for an
- * active cpu.
- */
- idx = sd->busy_idx;
- imbalance_pct = sd->imbalance_pct;
- break;
- }
- }
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
rcu_read_unlock();
- if (WARN_ON_ONCE(idx == -1))
- return 0;
+ faults = task_faults(p, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid);
- /*
- * XXX the below is mostly nicked from wake_affine(); we should
- * see about sharing a bit if at all possible; also it might want
- * some per entity weight love.
- */
- weight = p->se.load.weight;
- env.src_stats.load = source_load(env.src_cpu, idx);
- env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
- env.src_stats.eff_load *= power_of(env.src_cpu);
- env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
-
- for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
- env.dst_cpu = cpu;
- env.dst_stats.load = target_load(cpu, idx);
-
- /* If the CPU is idle, use it */
- if (!env.dst_stats.load) {
- env.best_cpu = cpu;
- goto migrate;
- }
+ /* Find an alternative node with relatively better statistics */
+ for_each_online_node(nid) {
+ long imp;
- /* Otherwise check the target CPU load */
- env.dst_stats.eff_load = 100;
- env.dst_stats.eff_load *= power_of(cpu);
- env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+ if (nid == env.src_nid)
+ continue;
- /*
- * Destination is considered balanced if the destination CPU is
- * less loaded than the source CPU. Unfortunately there is a
- * risk that a task running on a lightly loaded CPU will not
- * migrate to its preferred node due to load imbalances.
- */
- balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
- if (!balanced)
+ /* Only consider nodes that recorded more faults */
+ imp = task_faults(p, nid) - faults;
+ if (imp < 0)
continue;
- if (env.dst_stats.eff_load < env.best_load) {
- env.best_load = env.dst_stats.eff_load;
- env.best_cpu = cpu;
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+
+ env.dst_cpu = cpu;
+ task_numa_compare(&env, imp);
}
}
-migrate:
- return migrate_task_to(p, env.best_cpu);
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
+ if (env.best_task == NULL) {
+ int ret = migrate_task_to(p, env.best_cpu);
+ return ret;
+ }
+
+ ret = migrate_swap(p, env.best_task);
+ put_task_struct(env.best_task);
+ return ret;
}
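A standalone sketch of the improvement score that drives this search: the source task's fault differential between candidate and source node (computed in task_numa_migrate()), plus, for a swap, the displaced task's differential in the opposite direction (added in task_numa_compare()). The fault counts are invented, and the private/shared split and capacity checks are ignored.

#include <stdio.h>

static long swap_improvement(long p_faults_src, long p_faults_dst,
			     long cur_faults_src, long cur_faults_dst)
{
	/* task_numa_migrate(): how much better the candidate node is for p */
	long imp = p_faults_dst - p_faults_src;

	/* task_numa_compare(): account for the task we would displace */
	imp += cur_faults_src - cur_faults_dst;
	return imp;
}

int main(void)
{
	/* p faults mostly on the remote node, cur mostly on p's node:
	 * swapping helps both, so the score is strongly positive */
	printf("%ld\n", swap_improvement(10, 50, 60, 5));	/* 95 */

	/* both tasks already run where most of their faults are: negative */
	printf("%ld\n", swap_improvement(80, 20, 10, 70));	/* -120 */
	return 0;
}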
/* Attempt to migrate a task to a CPU on the preferred node. */
@@ -1050,7 +1163,7 @@ static void task_numa_placement(struct task_struct *p)
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
- unsigned long faults;
+ unsigned long faults = 0;
int priv, i;
for (priv = 0; priv < 2; priv++) {
@@ -1060,10 +1173,10 @@ static void task_numa_placement(struct task_struct *p)
p->numa_faults[i] >>= 1;
p->numa_faults[i] += p->numa_faults_buffer[i];
p->numa_faults_buffer[i] = 0;
+
+ faults += p->numa_faults[i];
}
- /* Find maximum private faults */
- faults = p->numa_faults[task_faults_idx(nid, 1)];
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
@@ -4455,8 +4568,6 @@ static int move_one_task(struct lb_env *env)
return 0;
}
-static unsigned long task_h_load(struct task_struct *p);
-
static const unsigned int sched_nr_migrate_break = 32;
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4dc92d0..691e969 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -610,9 +610,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
return hsd;
}
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & flag)
+ break;
+ }
+
+ return sd;
+}
+
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
struct sched_group_power {
atomic_t ref;
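For illustration, a userspace model of the bottom-up walk lowest_flag_domain() performs and of caching its result per CPU, as update_top_cache_domain() now does for sd_numa. The domain names, flag value and the toy two-CPU hierarchy are invented; the kernel walks rq->sd via for_each_domain() and publishes the cached pointer with rcu_assign_pointer().

#include <stdio.h>
#include <stddef.h>

#define SD_NUMA 0x4000			/* placeholder flag value */
#define NR_CPUS 2

struct sched_domain {
	const char *name;
	int flags;
	struct sched_domain *parent;	/* next, larger domain */
};

/* smallest domain carrying the flag, or NULL if no level has it */
static struct sched_domain *lowest_flag_domain(struct sched_domain *sd, int flag)
{
	for (; sd; sd = sd->parent)
		if (sd->flags & flag)
			break;
	return sd;
}

static struct sched_domain *sd_numa_cache[NR_CPUS];

int main(void)
{
	/* a toy per-CPU hierarchy: SMT -> MC -> NUMA */
	struct sched_domain numa = { "NUMA", SD_NUMA, NULL };
	struct sched_domain mc   = { "MC",   0,       &numa };
	struct sched_domain smt0 = { "SMT0", 0,       &mc   };
	struct sched_domain smt1 = { "SMT1", 0,       &mc   };
	struct sched_domain *bottom[NR_CPUS] = { &smt0, &smt1 };
	int cpu;

	/* what update_top_cache_domain() now does for every CPU */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sd_numa_cache[cpu] = lowest_flag_domain(bottom[cpu], SD_NUMA);

	/* prints: NUMA */
	printf("%s\n", sd_numa_cache[1] ? sd_numa_cache[1]->name : "(none)");
	return 0;
}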