Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 1445 |
1 file changed, 205 insertions, 1240 deletions
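The core behavioural change in the hunks below is to the NUMA scan period: the adaptive per-task machinery (task_scan_min()/task_scan_max(), update_task_scan_period()) is removed, and task_numa_fault() instead backs the period off by jiffies_to_msecs(10) on every properly placed (non-migrated) fault, while task_numa_work() periodically resets it to the minimum in case the workload changes phase. A minimal userspace model of that backoff-and-reset loop follows; the constants mirror the sysctl defaults visible in the patch, but every name here is local to the sketch rather than a kernel symbol.

/*
 * Model of the scan-period handling in the task_numa_fault() and
 * task_numa_work() hunks below: each non-migrated fault backs the
 * period off by ~10ms up to scan_period_max; once scan_period_reset
 * ms elapse, the period snaps back to scan_period_min.
 */
#include <stdio.h>

static const unsigned int scan_period_min   = 100;       /* ms */
static const unsigned int scan_period_max   = 100 * 50;  /* ms */
static const unsigned int scan_period_reset = 100 * 600; /* ms */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int period = scan_period_min;
	unsigned int now, next_reset = scan_period_reset;

	/* Simulate one well-placed (!migrated) fault every 100ms. */
	for (now = 0; now <= 70000; now += 100) {
		if (now >= next_reset) {
			/* the periodic "blunt hammer" reset */
			period = scan_period_min;
			next_reset = now + scan_period_reset;
		}
		/* pages did not migrate, so scan slower */
		period = min_u(scan_period_max, period + 10);

		if (now % 10000 == 0)
			printf("t=%5ums scan period=%4ums\n", now, period);
	}
	return 0;
}

Running it shows the period climbing from 100ms to the 5000ms cap and then snapping back to the minimum at the 60-second reset, which is exactly the phase-change safety valve the added comment in the diff describes.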
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd773ad..7c70201 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -681,8 +681,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static unsigned long task_h_load(struct task_struct *p); - static inline void __update_task_entity_contrib(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ @@ -820,12 +818,11 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * Approximate time to scan a full NUMA task in ms. The task scan period is - * calculated based on the tasks virtual memory size and - * numa_balancing_scan_size. + * numa task sample period in ms */ -unsigned int sysctl_numa_balancing_scan_period_min = 1000; -unsigned int sysctl_numa_balancing_scan_period_max = 60000; +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -833,835 +830,41 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. - */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - -static unsigned int task_nr_scan_windows(struct task_struct *p) -{ - unsigned long rss = 0; - unsigned long nr_scan_pages; - - /* - * Calculations based on RSS as non-present and empty pages are skipped - * by the PTE scanner and NUMA hinting faults should be trapped based - * on resident pages - */ - nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); - rss = get_mm_rss(p->mm); - if (!rss) - rss = nr_scan_pages; - - rss = round_up(rss, nr_scan_pages); - return rss / nr_scan_pages; -} - -/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ -#define MAX_SCAN_WINDOW 2560 - -static unsigned int task_scan_min(struct task_struct *p) -{ - unsigned int scan, floor; - unsigned int windows = 1; - - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; - floor = 1000 / windows; - - scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); - return max_t(unsigned int, floor, scan); -} - -static unsigned int task_scan_max(struct task_struct *p) -{ - unsigned int smin = task_scan_min(p); - unsigned int smax; - - /* Watch for min being lower than max due to floor calculations */ - smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); - return max(smin, smax); -} - -/* - * Once a preferred node is selected the scheduler balancer will prefer moving - * a task to that node for sysctl_numa_balancing_settle_count number of PTE - * scans. This will give the process the chance to accumulate more faults on - * the preferred node but still allow the scheduler to move the task again if - * the nodes CPUs are overloaded. 
- */ -unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; - -static void account_numa_enqueue(struct rq *rq, struct task_struct *p) -{ - rq->nr_numa_running += (p->numa_preferred_nid != -1); - rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); -} - -static void account_numa_dequeue(struct rq *rq, struct task_struct *p) -{ - rq->nr_numa_running -= (p->numa_preferred_nid != -1); - rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); -} - -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - struct list_head task_list; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long faults[0]; -}; - -pid_t task_numa_group_id(struct task_struct *p) -{ - return p->numa_group ? p->numa_group->gid : 0; -} - -static inline int task_faults_idx(int nid, int priv) -{ - return 2 * nid + priv; -} - -static inline unsigned long task_faults(struct task_struct *p, int nid) -{ - if (!p->numa_faults) - return 0; - - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; -} - -static inline unsigned long group_faults(struct task_struct *p, int nid) -{ - if (!p->numa_group) - return 0; - - return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; -} - -/* - * These return the fraction of accesses done by a particular task, or - * task group, on a particular numa node. The group weight is given a - * larger multiplier, in order to group tasks together that are almost - * evenly spread out between numa nodes. - */ -static inline unsigned long task_weight(struct task_struct *p, int nid) -{ - unsigned long total_faults; - - if (!p->numa_faults) - return 0; - - total_faults = p->total_numa_faults; - - if (!total_faults) - return 0; - - return 1000 * task_faults(p, nid) / total_faults; -} - -static inline unsigned long group_weight(struct task_struct *p, int nid) -{ - if (!p->numa_group || !p->numa_group->total_faults) - return 0; - - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; -} - -static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static unsigned long power_of(int cpu); -static long effective_load(struct task_group *tg, int cpu, long wl, long wg); - -/* Cached statistics for all CPUs within a node */ -struct numa_stats { - unsigned long nr_running; - unsigned long load; - - /* Total compute capacity of CPUs on a node */ - unsigned long power; - - /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long capacity; - int has_capacity; -}; - -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) -{ - int cpu, cpus = 0; - - memset(ns, 0, sizeof(*ns)); - for_each_cpu(cpu, cpumask_of_node(nid)) { - struct rq *rq = cpu_rq(cpu); - - ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); - ns->power += power_of(cpu); - - cpus++; - } - - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll either bail at !has_capacity, or we'll detect a huge imbalance - * and bail there. 
- */ - if (!cpus) - return; - - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; - ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->capacity); -} - -struct task_numa_env { - struct task_struct *p; - - int src_cpu, src_nid; - int dst_cpu, dst_nid; - - struct numa_stats src_stats, dst_stats; - - int imbalance_pct, idx; - - struct task_struct *best_task; - long best_imp; - int best_cpu; -}; - -static void task_numa_assign(struct task_numa_env *env, - struct task_struct *p, long imp) -{ - if (env->best_task) - put_task_struct(env->best_task); - if (p) - get_task_struct(p); - - env->best_task = p; - env->best_imp = imp; - env->best_cpu = env->dst_cpu; -} - -/* - * This checks if the overall compute and NUMA accesses of the system would - * be improved if the source tasks was migrated to the target dst_cpu taking - * into account that it might be best if task running on the dst_cpu should - * be exchanged with the source task - */ -static void task_numa_compare(struct task_numa_env *env, - long taskimp, long groupimp) -{ - struct rq *src_rq = cpu_rq(env->src_cpu); - struct rq *dst_rq = cpu_rq(env->dst_cpu); - struct task_struct *cur; - long dst_load, src_load; - long load; - long imp = (groupimp > 0) ? groupimp : taskimp; - - rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ - cur = NULL; - - /* - * "imp" is the fault differential for the source task between the - * source and destination node. Calculate the total differential for - * the source task and potential destination task. The more negative - * the value is, the more rmeote accesses that would be expected to - * be incurred if the tasks were swapped. - */ - if (cur) { - /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) - goto unlock; - - /* - * If dst and source tasks are in the same NUMA group, or not - * in any group then look only at task weights. - */ - if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); - /* - * Add some hysteresis to prevent swapping the - * tasks within a group over tiny differences. - */ - if (cur->numa_group) - imp -= imp/16; - } else { - /* - * Compare the group weights. If a task is all by - * itself (not part of a group), use the task weight - * instead. - */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - - if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); - else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); - } - } - - if (imp < env->best_imp) - goto unlock; - - if (!cur) { - /* Is there capacity at our destination? */ - if (env->src_stats.has_capacity && - !env->dst_stats.has_capacity) - goto unlock; - - goto balance; - } - - /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) - goto assign; - - /* - * In the overloaded case, try and keep the load balanced. 
- */ -balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; - - /* XXX missing power terms */ - load = task_h_load(env->p); - dst_load += load; - src_load -= load; - - if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; - } - - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) - goto unlock; - -assign: - task_numa_assign(env, cur, imp); -unlock: - rcu_read_unlock(); -} - -static void task_numa_find_cpu(struct task_numa_env *env, - long taskimp, long groupimp) -{ - int cpu; - - for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { - /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) - continue; - - env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp); - } -} - -static int task_numa_migrate(struct task_struct *p) -{ - struct task_numa_env env = { - .p = p, - - .src_cpu = task_cpu(p), - .src_nid = task_node(p), - - .imbalance_pct = 112, - - .best_task = NULL, - .best_imp = 0, - .best_cpu = -1 - }; - struct sched_domain *sd; - unsigned long taskweight, groupweight; - int nid, ret; - long taskimp, groupimp; - - /* - * Pick the lowest SD_NUMA domain, as that would have the smallest - * imbalance and would be the first to start moving tasks about. - * - * And we want to avoid any moving of tasks about, as that would create - * random movement of tasks -- counter the numa conditions we're trying - * to satisfy here. - */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) - env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; - rcu_read_unlock(); - - /* - * Cpusets can break the scheduler domain tree into smaller - * balance domains, some of which do not cross NUMA boundaries. - * Tasks that are "trapped" in such domains cannot be migrated - * elsewhere, so there is no point in (re)trying. - */ - if (unlikely(!sd)) { - p->numa_preferred_nid = cpu_to_node(task_cpu(p)); - return -EINVAL; - } - - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); - env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; - update_numa_stats(&env.dst_stats, env.dst_nid); - - /* If the preferred nid has capacity, try to use it. */ - if (env.dst_stats.has_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); - - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { - for_each_online_node(nid) { - if (nid == env.src_nid || nid == p->numa_preferred_nid) - continue; - - /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; - if (taskimp < 0 && groupimp < 0) - continue; - - env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); - task_numa_find_cpu(&env, taskimp, groupimp); - } - } - - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - - sched_setnuma(p, env.dst_nid); - - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. 
- */ - p->numa_scan_period = task_scan_min(p); - - if (env.best_task == NULL) { - int ret = migrate_task_to(p, env.best_cpu); - return ret; - } - - ret = migrate_swap(p, env.best_task); - put_task_struct(env.best_task); - return ret; -} - -/* Attempt to migrate a task to a CPU on the preferred node. */ -static void numa_migrate_preferred(struct task_struct *p) -{ - /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) - return; - - /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; - - /* Success if task is already running on preferred CPU */ - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) - return; - - /* Otherwise, try migrate to a CPU on the preferred node */ - task_numa_migrate(p); -} - -/* - * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS - * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease - */ -#define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 - -/* - * Increase the scan period (slow down scanning) if the majority of - * our memory is already on our local node, or if the majority of - * the page accesses are shared with other processes. - * Otherwise, decrease the scan period. - */ -static void update_task_scan_period(struct task_struct *p, - unsigned long shared, unsigned long private) -{ - unsigned int period_slot; - int ratio; - int diff; - - unsigned long remote = p->numa_faults_locality[0]; - unsigned long local = p->numa_faults_locality[1]; - - /* - * If there were no record hinting faults then either the task is - * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower - */ - if (local + shared == 0) { - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period << 1); - - p->mm->numa_next_scan = jiffies + - msecs_to_jiffies(p->numa_scan_period); - - return; - } - - /* - * Prepare to scale scan period relative to the current period. - * == NUMA_PERIOD_THRESHOLD scan period stays the same - * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) - * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) - */ - period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); - ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); - if (ratio >= NUMA_PERIOD_THRESHOLD) { - int slot = ratio - NUMA_PERIOD_THRESHOLD; - if (!slot) - slot = 1; - diff = slot * period_slot; - } else { - diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; - - /* - * Scale scan rate increases based on sharing. There is an - * inverse relationship between the degree of sharing and - * the adjustment made to the scanning period. 
Broadly - * speaking the intent is that there is little point - * scanning faster if shared accesses dominate as it may - * simply bounce migrations uselessly - */ - period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); - diff = (diff * ratio) / NUMA_PERIOD_SLOTS; - } - - p->numa_scan_period = clamp(p->numa_scan_period + diff, - task_scan_min(p), task_scan_max(p)); - memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); -} - static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1, max_group_nid = -1; - unsigned long max_faults = 0, max_group_faults = 0; - unsigned long fault_types[2] = { 0, 0 }; - spinlock_t *group_lock = NULL; + int seq; + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; - p->numa_scan_period_max = task_scan_max(p); - - /* If the task is part of a group prevent parallel updates to group stats */ - if (p->numa_group) { - group_lock = &p->numa_group->lock; - spin_lock(group_lock); - } - - /* Find the node with the highest number of faults */ - for_each_online_node(nid) { - unsigned long faults = 0, group_faults = 0; - int priv, i; - - for (priv = 0; priv < 2; priv++) { - long diff; - - i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; - - /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; - - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; - p->total_numa_faults += diff; - if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; - } - } - - if (faults > max_faults) { - max_faults = faults; - max_nid = nid; - } - - if (group_faults > max_group_faults) { - max_group_faults = group_faults; - max_group_nid = nid; - } - } - - update_task_scan_period(p, fault_types[0], fault_types[1]); - - if (p->numa_group) { - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. 
- */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - - spin_unlock(group_lock); - } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); - } -} - -static inline int get_numa_group(struct numa_group *grp) -{ - return atomic_inc_not_zero(&grp->refcount); -} - -static inline void put_numa_group(struct numa_group *grp) -{ - if (atomic_dec_and_test(&grp->refcount)) - kfree_rcu(grp, rcu); -} - -static void task_numa_group(struct task_struct *p, int cpupid, int flags, - int *priv) -{ - struct numa_group *grp, *my_grp; - struct task_struct *tsk; - bool join = false; - int cpu = cpupid_to_cpu(cpupid); - int i; - - if (unlikely(!p->numa_group)) { - unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(unsigned long); - - grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!grp) - return; - - atomic_set(&grp->refcount, 1); - spin_lock_init(&grp->lock); - INIT_LIST_HEAD(&grp->task_list); - grp->gid = p->pid; - - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; - - grp->total_faults = p->total_numa_faults; - - list_add(&p->numa_entry, &grp->task_list); - grp->nr_tasks++; - rcu_assign_pointer(p->numa_group, grp); - } - - rcu_read_lock(); - tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); - - if (!cpupid_match_pid(tsk, cpupid)) - goto no_join; - - grp = rcu_dereference(tsk->numa_group); - if (!grp) - goto no_join; - - my_grp = p->numa_group; - if (grp == my_grp) - goto no_join; - - /* - * Only join the other group if its bigger; if we're the bigger group, - * the other task will join us. - */ - if (my_grp->nr_tasks > grp->nr_tasks) - goto no_join; - - /* - * Tie-break on the grp address. - */ - if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) - goto no_join; - - /* Always join threads in the same process. 
*/ - if (tsk->mm == current->mm) - join = true; - - /* Simple filter to avoid false positives due to PID collisions */ - if (flags & TNF_SHARED) - join = true; - - /* Update priv based on whether false sharing was detected */ - *priv = !join; - - if (join && !get_numa_group(grp)) - goto no_join; - - rcu_read_unlock(); - - if (!join) - return; - - double_lock(&my_grp->lock, &grp->lock); - - for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; - } - my_grp->total_faults -= p->total_numa_faults; - grp->total_faults += p->total_numa_faults; - - list_move(&p->numa_entry, &grp->task_list); - my_grp->nr_tasks--; - grp->nr_tasks++; - - spin_unlock(&my_grp->lock); - spin_unlock(&grp->lock); - - rcu_assign_pointer(p->numa_group, grp); - - put_numa_group(my_grp); - return; - -no_join: - rcu_read_unlock(); - return; -} - -void task_numa_free(struct task_struct *p) -{ - struct numa_group *grp = p->numa_group; - int i; - void *numa_faults = p->numa_faults; - - if (grp) { - spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; - grp->total_faults -= p->total_numa_faults; - - list_del(&p->numa_entry); - grp->nr_tasks--; - spin_unlock(&grp->lock); - rcu_assign_pointer(p->numa_group, NULL); - put_numa_group(grp); - } - - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; - kfree(numa_faults); + /* FIXME: Scheduling placement policy hints go here */ } /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; - bool migrated = flags & TNF_MIGRATED; - int priv; if (!numabalancing_enabled) return; - /* for example, ksmd faulting in a user's mm */ - if (!p->mm) - return; - - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - - /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; - - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) - return; - - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); - p->total_numa_faults = 0; - memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); - } + /* FIXME: Allocate task-specific structure for placement policy here */ /* - * First accesses are treated as private, otherwise consider accesses - * to be private if the accessing pid has not changed + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes */ - if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { - priv = 1; - } else { - priv = cpupid_match_pid(p, last_cpupid); - if (!priv && !(flags & TNF_NO_GROUP)) - task_numa_group(p, last_cpupid, flags, &priv); - } + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); task_numa_placement(p); - - /* - * Retry task to preferred node migration periodically, in case it - * case it previously failed, or the scheduler moved us. 
- */ - if (time_after(jiffies, p->numa_migrate_retry)) - numa_migrate_preferred(p); - - if (migrated) - p->numa_pages_migrated += pages; - - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -1681,7 +884,6 @@ void task_numa_work(struct callback_head *work) struct mm_struct *mm = p->mm; struct vm_area_struct *vma; unsigned long start, end; - unsigned long nr_pte_updates = 0; long pages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -1698,9 +900,35 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; - if (!mm->numa_next_scan) { - mm->numa_next_scan = now + - msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); } /* @@ -1710,20 +938,20 @@ void task_numa_work(struct callback_head *work) if (time_before(now, migrate)) return; - if (p->numa_scan_period == 0) { - p->numa_scan_period_max = task_scan_max(p); - p->numa_scan_period = task_scan_min(p); - } + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; next_scan = now + msecs_to_jiffies(p->numa_scan_period); if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; /* - * Delay this task enough that another task of this mm will likely win - * the next time around. + * Do not set pte_numa if the current running node is rate-limited. + * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work */ - p->node_stamp += 2 * TICK_NSEC; + if (migrate_ratelimited(numa_node_id())) + return; start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; @@ -1739,32 +967,18 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) + if (!vma_migratable(vma)) continue; - /* - * Shared library pages mapped by multiple processes are not - * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso - * as migrating the pages will be of marginal benefit. - */ - if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + /* Skip small VMAs. 
They are not likely to be of relevance */ + if (vma->vm_end - vma->vm_start < HPAGE_SIZE) continue; do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - nr_pte_updates += change_prot_numa(vma, start, end); - - /* - * Scan sysctl_numa_balancing_scan_size but ensure that - * at least one PTE is updated so that unused virtual - * address space is quickly skipped. - */ - if (nr_pte_updates) - pages -= (end - start) >> PAGE_SHIFT; + pages -= change_prot_numa(vma, start, end); start = end; if (pages <= 0) @@ -1774,10 +988,10 @@ void task_numa_work(struct callback_head *work) out: /* - * It is possible to reach the end of the VMA list but the last few - * VMAs are not guaranteed to the vma_migratable. If they are not, we - * would find the !migratable VMA on the next scan but not reset the - * scanner to the start so check it now. + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. */ if (vma) mm->numa_scan_offset = start; @@ -1811,8 +1025,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now - curr->node_stamp > period) { if (!curr->node_stamp) - curr->numa_scan_period = task_scan_min(curr); - curr->node_stamp += period; + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; + curr->node_stamp = now; if (!time_before(jiffies, curr->mm->numa_next_scan)) { init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ @@ -1824,14 +1038,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } - -static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) -{ -} - -static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) -{ -} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -1841,12 +1047,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP - if (entity_is_task(se)) { - struct rq *rq = rq_of(cfs_rq); - - account_numa_enqueue(rq, task_of(se)); - list_add(&se->group_node, &rq->cfs_tasks); - } + if (entity_is_task(se)) + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -1857,10 +1059,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + if (entity_is_task(se)) list_del_init(&se->group_node); - } cfs_rq->nr_running--; } @@ -2178,7 +1378,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, long contrib; /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, + contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; @@ -2870,14 +2070,13 @@ static inline bool cfs_bandwidth_used(void) return static_key_false(&__cfs_bandwidth_used); } -void cfs_bandwidth_usage_inc(void) +void account_cfs_bandwidth_used(int enabled, int was_enabled) { - static_key_slow_inc(&__cfs_bandwidth_used); -} - -void cfs_bandwidth_usage_dec(void) -{ - 
static_key_slow_dec(&__cfs_bandwidth_used); + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + static_key_slow_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -2885,8 +2084,7 @@ static bool cfs_bandwidth_used(void) return true; } -void cfs_bandwidth_usage_inc(void) {} -void cfs_bandwidth_usage_dec(void) {} +void account_cfs_bandwidth_used(int enabled, int was_enabled) {} #endif /* HAVE_JUMP_LABEL */ /* @@ -3137,8 +2335,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); } @@ -3252,13 +2448,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (idle) goto out_unlock; - /* - * if we have relooped after returning idle once, we need to update our - * status as actually running, so that other cpus doing - * __start_cfs_bandwidth will stop trying to cancel us. - */ - cfs_b->timer_active = 1; - __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -3319,13 +2508,7 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* - * Are we near the end of the current quota period? - * - * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the - * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of - * migrate_hrtimers, base is never cleared, so we are fine. - */ +/* are we near the end of the current quota period? */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -3401,12 +2584,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(&cfs_b->lock); - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) return; - } + raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { runtime = cfs_b->runtime; cfs_b->runtime = 0; @@ -3527,11 +2708,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * (timer_active==0 becomes visible before the hrtimer call-back * terminates). 
In either case we ensure that it's re-programmed */ - while (unlikely(hrtimer_active(&cfs_b->period_timer)) && - hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { - /* bounce the lock to allow do_sched_cfs_period_timer to run */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { raw_spin_unlock(&cfs_b->lock); - cpu_relax(); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ if (cfs_b->timer_active) @@ -3932,7 +3113,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent || !wl) /* the trivial, non-cgroup case */ + if (!tg->parent) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { @@ -3985,7 +3166,8 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) } #else -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { return wl; } @@ -4238,10 +3420,11 @@ done: * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; @@ -4721,12 +3904,9 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; -enum fbq_type { regular, remote, all }; - #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_DST_PINNED 0x04 -#define LBF_SOME_PINNED 0x08 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; @@ -4749,8 +3929,6 @@ struct lb_env { unsigned int loop; unsigned int loop_break; unsigned int loop_max; - - enum fbq_type fbq_type; }; /* @@ -4797,78 +3975,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } -#ifdef CONFIG_NUMA_BALANCING -/* Returns true if the destination node has incurred more faults */ -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) -{ - int src_nid, dst_nid; - - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || - !(env->sd->flags & SD_NUMA)) { - return false; - } - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; - - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) - return true; - - return false; -} - - -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) -{ - int src_nid, dst_nid; - - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) - return false; - - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return false; - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Migrating away from the preferred node is always bad. 
*/ - if (src_nid == p->numa_preferred_nid) - return true; - - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; -} - -#else -static inline bool migrate_improves_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; -} - -static inline bool migrate_degrades_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; -} -#endif - /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -4891,8 +3997,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p, se.statistics.nr_failed_migrations_affine); - env->flags |= LBF_SOME_PINNED; - /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't @@ -4901,13 +4005,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { - env->flags |= LBF_DST_PINNED; + env->flags |= LBF_SOME_PINNED; env->new_dst_cpu = cpu; break; } @@ -4926,24 +4030,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) task is cache cold, or + * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); - if (!tsk_cache_hot) - tsk_cache_hot = migrate_degrades_locality(p, env); - - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } -#endif - return 1; - } + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4986,6 +4077,8 @@ static int move_one_task(struct lb_env *env) return 0; } +static unsigned long task_h_load(struct task_struct *p); + static const unsigned int sched_nr_migrate_break = 32; /* @@ -5198,10 +4291,6 @@ struct sg_lb_stats { unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ -#ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; - unsigned int nr_preferred_running; -#endif }; /* @@ -5241,7 +4330,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The idle status of the CPU for whose sd load_idx is obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. * * Return: The load index. 
*/ @@ -5358,7 +4447,7 @@ void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long power; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -5370,7 +4459,7 @@ void update_group_power(struct sched_domain *sd, int cpu) return; } - power_orig = power = 0; + power = 0; if (child->flags & SD_OVERLAP) { /* @@ -5378,33 +4467,8 @@ void update_group_power(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; - struct rq *rq = cpu_rq(cpu); - - /* - * build_sched_domains() -> init_sched_groups_power() - * gets here before we've attached the domains to the - * runqueues. - * - * Use power_of(), which is set irrespective of domains - * in update_cpu_power(). - * - * This avoids power/power_orig from being 0 and - * causing divide-by-zero issues on boot. - * - * Runtime updates will correct power_orig. - */ - if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); - continue; - } - - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; - } + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); } else { /* * !SD_OVERLAP domains can assume that child groups @@ -5413,14 +4477,12 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; power += group->sgp->power; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgp->power_orig = sdg->sgp->power = power; } /* @@ -5464,12 +4526,13 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * cpu 3 and leave one of the cpus in the second group unused. * * The current solution to this issue is detecting the skew in the first group - * by noticing the lower domain failed to reach balance and had difficulty - * moving tasks due to affinity constraints. + * by noticing it has a cpu that is overloaded while the remaining cpus are + * idle -- or rather, there's a distinct imbalance in the cpus; see + * sg_imbalanced(). * * When this is so detected; this group becomes a candidate for busiest; see - * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * update_sd_pick_busiest(). And calculcate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditional to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -5477,36 +4540,49 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * subtle and fragile situation. */ -static inline int sg_imbalanced(struct sched_group *group) +struct sg_imb_stats { + unsigned long max_nr_running, min_nr_running; + unsigned long max_cpu_load, min_cpu_load; +}; + +static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) { - return group->sgp->imbalance; + sgi->max_cpu_load = sgi->max_nr_running = 0UL; + sgi->min_cpu_load = sgi->min_nr_running = ~0UL; } -/* - * Compute the group capacity. - * - * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. 
- */ -static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +static inline void +update_sg_imb_stats(struct sg_imb_stats *sgi, + unsigned long load, unsigned long nr_running) { - unsigned int capacity, smt, cpus; - unsigned int power, power_orig; - - power = group->sgp->power; - power_orig = group->sgp->power_orig; - cpus = group->group_weight; + if (load > sgi->max_cpu_load) + sgi->max_cpu_load = load; + if (sgi->min_cpu_load > load) + sgi->min_cpu_load = load; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); - capacity = cpus / smt; /* cores */ + if (nr_running > sgi->max_nr_running) + sgi->max_nr_running = nr_running; + if (sgi->min_nr_running > nr_running) + sgi->min_nr_running = nr_running; +} - capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); +static inline int +sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) +{ + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of a task. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && + (sgi->max_nr_running - sgi->min_nr_running) > 1) + return 1; - return capacity; + return 0; } /** @@ -5521,11 +4597,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { + struct sg_imb_stats sgi; unsigned long nr_running; unsigned long load; int i; - memset(sgs, 0, sizeof(*sgs)); + init_sg_imb_stats(&sgi); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -5533,22 +4610,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, nr_running = rq->nr_running; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { load = target_load(i, load_idx); - else + } else { load = source_load(i, load_idx); + update_sg_imb_stats(&sgi, load, nr_running); + } sgs->group_load += load; sgs->sum_nr_running += nr_running; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; } + if (local_group && (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, group->sgp->next_update))) + update_group_power(env->sd, env->dst_cpu); + /* Adjust by relative CPU power of the group */ sgs->group_power = group->sgp->power; sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; @@ -5556,10 +4635,15 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_weight = group->group_weight; + sgs->group_imb = sg_imbalanced(sgs, &sgi); + + sgs->group_capacity = + DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); + + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(env->sd, group); - sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity = sg_capacity(env, group); + sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) sgs->group_has_capacity = 1; @@ -5609,42 +4693,14 @@ static bool update_sd_pick_busiest(struct 
lb_env *env, return false; } -#ifdef CONFIG_NUMA_BALANCING -static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) -{ - if (sgs->sum_nr_running > sgs->nr_numa_running) - return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) - return remote; - return all; -} - -static inline enum fbq_type fbq_classify_rq(struct rq *rq) -{ - if (rq->nr_running > rq->nr_numa_running) - return regular; - if (rq->nr_running > rq->nr_preferred_running) - return remote; - return all; -} -#else -static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) -{ - return all; -} - -static inline enum fbq_type fbq_classify_rq(struct rq *rq) -{ - return regular; -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. + * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -5664,17 +4720,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (local_group) { sds->local = sg; sgs = &sds->local_stat; - - if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); } + memset(sgs, 0, sizeof(*sgs)); update_sg_lb_stats(env, sg, load_idx, local_group, sgs); - if (local_group) - goto next_group; - /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try @@ -5685,25 +4735,21 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && sds->local && - sds->local_stat.group_has_capacity) + if (prefer_sibling && !local_group && + sds->local && sds->local_stat.group_has_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); - if (update_sd_pick_busiest(env, sds, sg, sgs)) { - sds->busiest = sg; - sds->busiest_stat = *sgs; - } - -next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_pwr += sgs->group_power; + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { + sds->busiest = sg; + sds->busiest_stat = *sgs; + } + sg = sg->next; } while (sg != env->sd->groups); - - if (env->sd->flags & SD_NUMA) - env->fbq_type = fbq_classify_group(&sds->busiest_stat); } /** @@ -6007,39 +5053,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity, wl; - enum fbq_type rt; - - rq = cpu_rq(i); - rt = fbq_classify_rq(rq); + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); + unsigned long wl; - /* - * We classify groups/runqueues into three groups: - * - regular: there are !numa tasks - * - remote: there are numa tasks that run on the 'wrong' node - * - all: there is no distinction - * - * In order to avoid migrating ideally placed numa tasks, - * ignore those when there's better options. 
- * - * If we ignore the actual busiest queue to migrate another - * task, the next balance pass can still reduce the busiest - * queue by moving tasks around inside the node. - * - * If we cannot move enough load due to this classification - * the next pass will adjust the group classification and - * allow migration of more tasks. - * - * Both cases only affect the total convergence complexity. - */ - if (rt > env->fbq_type) - continue; - - power = power_of(i); - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (!capacity) capacity = fix_small_capacity(env->sd, group); + rq = cpu_rq(i); wl = weighted_cpuload(i); /* @@ -6142,7 +5164,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -6156,7 +5177,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, - .fbq_type = all, }; /* @@ -6248,17 +5268,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { - - /* Prevent to re-select dst_cpu via env's cpus */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_DST_PINNED; + env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -6266,18 +5286,6 @@ more_balance: goto more_balance; } - /* - * We failed to reach balance because of affinity. 
- */ - if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; - - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { - *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; - } - /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -6385,7 +5393,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - u64 curr_cost = 0; this_rq->idle_stamp = rq_clock(this_rq); @@ -6402,27 +5409,15 @@ void idle_balance(int this_cpu, struct rq *this_rq) for_each_domain(this_cpu, sd) { unsigned long interval; int continue_balancing = 1; - u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) - break; - if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); - - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; - - curr_cost += domain_cost; } interval = msecs_to_jiffies(sd->balance_interval); @@ -6444,9 +5439,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } - - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; } /* @@ -6580,16 +5572,16 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + for (; sd; sd = sd->parent) + atomic_inc(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6597,16 +5589,16 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + for (; sd; sd = sd->parent) + atomic_dec(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6670,39 +5662,15 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; - u64 max_cost = 0; + int need_serialize; update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { - /* - * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. - */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } - max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) continue; - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. 
- */ - if (!continue_balancing) { - if (need_decay) - continue; - break; - } - interval = sd->balance_interval; if (idle != CPU_IDLE) interval *= sd->busy_factor; @@ -6721,7 +5689,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* - * The LBF_DST_PINNED logic could have changed + * The LBF_SOME_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ @@ -6736,14 +5704,14 @@ out: next_balance = sd->last_balance + interval; update_next_balance = 1; } - } - if (need_decay) { + /* - * Ensure the rq-wide value also decays but keep it at a - * reasonable floor to avoid funnies with rq->avg_idle. + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. */ - rq->max_idle_balance_cost = - max((u64)sysctl_sched_migration_cost, max_cost); + if (!continue_balancing) + break; } rcu_read_unlock(); @@ -6813,8 +5781,6 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; - int nr_busy; if (unlikely(idle_cpu(cpu))) return 0; @@ -6840,22 +5806,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) goto need_kick; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); - - if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + for_each_domain(cpu, sd) { + struct sched_group *sg = sd->groups; + struct sched_group_power *sgp = sg->sgp; + int nr_busy = atomic_read(&sgp->nr_busy_cpus); - if (nr_busy > 1) + if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) goto need_kick_unlock; - } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); - - if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; + if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight + && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) + break; + } rcu_read_unlock(); return 0; @@ -7248,8 +6214,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - /* guarantee group entities always have weight */ - update_load_set(&se->load, NICE_0_LOAD); + update_load_set(&se->load, 0); se->parent = parent; } |