summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile3
-rw-r--r--kernel/sched/auto_group.c3
-rw-r--r--kernel/sched/completion.c299
-rw-r--r--kernel/sched/core.c1596
-rw-r--r--kernel/sched/cpuacct.c51
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/cputime.c79
-rw-r--r--kernel/sched/debug.c111
-rw-r--r--kernel/sched/fair.c2125
-rw-r--r--kernel/sched/features.h19
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/proc.c591
-rw-r--r--kernel/sched/rt.c154
-rw-r--r--kernel/sched/sched.h139
-rw-r--r--kernel/sched/stats.h90
-rw-r--r--kernel/sched/stop_task.c10
-rw-r--r--kernel/sched/wait.c504
17 files changed, 3845 insertions, 1935 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e..7b62140 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
+obj-y += wait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8..4a07353 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
if (IS_ERR(tg))
goto out_free;
- sched_online_group(tg, &root_task_group);
-
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
#endif
tg->autogroup = ag;
+ sched_online_group(tg, &root_task_group);
return ag;
out_free:
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 0000000..a63f4dc
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
+/*
+ * Generic wait-for-completion handler;
+ *
+ * It differs from semaphores in that their default case is the opposite,
+ * wait_for_completion default blocks whereas semaphore default non-block. The
+ * interface also makes it easy to 'complete' multiple waiting threads,
+ * something which isn't entirely natural for semaphores.
+ *
+ * But more importantly, the primitive documents the usage. Semaphores would
+ * typically be used for exclusion which gives rise to priority inversion.
+ * Waiting for completion is a typically sync point, but not an exclusion point.
+ */
+
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 0);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b3350..e85cda2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
- *
- * Its all a bit involved since we cannot program an hrt while holding the
- * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
- * reschedule event.
- *
- * When we get rescheduled we reprogram the hrtick_timer outside of the
- * rq->lock.
*/
static void hrtick_clear(struct rq *rq)
@@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
}
#ifdef CONFIG_SMP
+
+static int __hrtick_restart(struct rq *rq)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = hrtimer_get_softexpires(timer);
+
+ return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+}
+
/*
* called from hardirq (IPI) context
*/
@@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)
struct rq *rq = arg;
raw_spin_lock(&rq->lock);
- hrtimer_restart(&rq->hrtick_timer);
+ __hrtick_restart(rq);
rq->hrtick_csd_pending = 0;
raw_spin_unlock(&rq->lock);
}
@@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
- hrtimer_restart(timer);
+ __hrtick_restart(rq);
} else if (!rq->hrtick_csd_pending) {
__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
rq->hrtick_csd_pending = 1;
@@ -511,12 +513,11 @@ static inline void init_hrtick(void)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-#ifdef CONFIG_SMP
void resched_task(struct task_struct *p)
{
int cpu;
- assert_raw_spin_locked(&task_rq(p)->lock);
+ lockdep_assert_held(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
@@ -524,8 +525,10 @@ void resched_task(struct task_struct *p)
set_tsk_need_resched(p);
cpu = task_cpu(p);
- if (cpu == smp_processor_id())
+ if (cpu == smp_processor_id()) {
+ set_preempt_need_resched();
return;
+ }
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
@@ -544,6 +547,7 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
+#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
@@ -679,7 +683,7 @@ void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
- while ((s64)(rq->clock - rq->age_stamp) > period) {
+ while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
/*
* Inline assembly required to prevent the compiler
* optimising this loop into a divmod call.
@@ -691,12 +695,6 @@ void sched_avg_update(struct rq *rq)
}
}
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
-{
- assert_raw_spin_locked(&task_rq(p)->lock);
- set_tsk_need_resched(p);
-}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -765,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(p);
+ sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(p);
+ sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -931,6 +929,8 @@ static int effective_prio(struct task_struct *p)
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
*/
inline int task_curr(const struct task_struct *p)
{
@@ -974,13 +974,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq->skip_clock_update = 1;
}
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
@@ -990,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+ !(task_preempt_count(p) & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
/*
@@ -1011,21 +1004,114 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
- struct task_migration_notifier tmn;
-
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+ }
+
+ __set_task_cpu(p, new_cpu);
+}
+
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+ if (p->on_rq) {
+ struct rq *src_rq, *dst_rq;
- tmn.task = p;
- tmn.from_cpu = task_cpu(p);
- tmn.to_cpu = new_cpu;
+ src_rq = task_rq(p);
+ dst_rq = cpu_rq(cpu);
- atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu);
+ activate_task(dst_rq, p, 0);
+ check_preempt_curr(dst_rq, p, 0);
+ } else {
+ /*
+ * Task isn't running anymore; make it appear like we migrated
+ * it before it went to sleep. This means on wakeup we make the
+ * previous cpu our targer instead of where it really is.
+ */
+ p->wake_cpu = cpu;
}
+}
- __set_task_cpu(p, new_cpu);
+struct migration_swap_arg {
+ struct task_struct *src_task, *dst_task;
+ int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+ struct migration_swap_arg *arg = data;
+ struct rq *src_rq, *dst_rq;
+ int ret = -EAGAIN;
+
+ src_rq = cpu_rq(arg->src_cpu);
+ dst_rq = cpu_rq(arg->dst_cpu);
+
+ double_raw_lock(&arg->src_task->pi_lock,
+ &arg->dst_task->pi_lock);
+ double_rq_lock(src_rq, dst_rq);
+ if (task_cpu(arg->dst_task) != arg->dst_cpu)
+ goto unlock;
+
+ if (task_cpu(arg->src_task) != arg->src_cpu)
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+ goto unlock;
+
+ __migrate_swap_task(arg->src_task, arg->dst_cpu);
+ __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+ ret = 0;
+
+unlock:
+ double_rq_unlock(src_rq, dst_rq);
+ raw_spin_unlock(&arg->dst_task->pi_lock);
+ raw_spin_unlock(&arg->src_task->pi_lock);
+
+ return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+ struct migration_swap_arg arg;
+ int ret = -EINVAL;
+
+ arg = (struct migration_swap_arg){
+ .src_task = cur,
+ .src_cpu = task_cpu(cur),
+ .dst_task = p,
+ .dst_cpu = task_cpu(p),
+ };
+
+ if (arg.src_cpu == arg.dst_cpu)
+ goto out;
+
+ /*
+ * These three tests are all lockless; this is OK since all of them
+ * will be re-checked with proper locks held further down the line.
+ */
+ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+ goto out;
+
+ ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+ return ret;
}
struct migration_arg {
@@ -1247,9 +1333,9 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1340,13 +1426,14 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
p->sched_class->task_woken(rq, p);
if (rq->idle_stamp) {
- u64 delta = rq->clock - rq->idle_stamp;
- u64 max = 2*sysctl_sched_migration_cost;
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
+
+ update_avg(&rq->avg_idle, delta);
- if (delta > max)
+ if (rq->avg_idle > max)
rq->avg_idle = max;
- else
- update_avg(&rq->avg_idle, delta);
+
rq->idle_stamp = 0;
}
#endif
@@ -1377,6 +1464,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p);
if (p->on_rq) {
+ /* check_preempt_curr() may use rq clock */
+ update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
ret = 1;
}
@@ -1405,6 +1494,14 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ if (tif_need_resched())
+ set_preempt_need_resched();
+
if (llist_empty(&this_rq()->wake_list)
&& !tick_nohz_full_cpu(smp_processor_id())
&& !got_nohz_idle_kick())
@@ -1478,7 +1575,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
static int
@@ -1487,7 +1584,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
- smp_wmb();
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
@@ -1516,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1573,8 +1676,9 @@ out:
* @p: The process to be woken up.
*
* Attempt to wake up the nominated process and move it to the set of runnable
- * processes. Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
@@ -1597,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*
* __sched_fork() is basic setup used by init_idle() too:
*/
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -1609,15 +1713,6 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
- p->se.avg.runnable_avg_period = 0;
- p->se.avg.runnable_avg_sum = 0;
-#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -1630,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
- p->mm->numa_next_scan = jiffies;
- p->mm->numa_next_reset = jiffies;
+ p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
p->mm->numa_scan_seq = 0;
}
+ if (clone_flags & CLONE_VM)
+ p->numa_preferred_nid = current->numa_preferred_nid;
+ else
+ p->numa_preferred_nid = -1;
+
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
- p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ p->numa_faults_buffer = NULL;
+
+ INIT_LIST_HEAD(&p->numa_entry);
+ p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -1665,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
/*
* fork()/clone()-time setup:
*/
-void sched_fork(struct task_struct *p)
+void sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
- __sched_fork(p);
+ __sched_fork(clone_flags, p);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
@@ -1728,10 +1831,7 @@ void sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
-#ifdef CONFIG_PREEMPT_COUNT
- /* Want to start with kernel preemption disabled. */
- task_thread_info(p)->preempt_count = 1;
-#endif
+ init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif
@@ -1758,9 +1858,11 @@ void wake_up_new_task(struct task_struct *p)
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
- set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
+ /* Initialize new task's runnable average */
+ init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = 1;
@@ -1847,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
trace_sched_switch(prev, next);
- sched_info_switch(prev, next);
+ sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
@@ -1899,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
+ task_numa_free(prev);
+
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -2069,575 +2173,6 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
-unsigned long this_cpu_load(void)
-{
- struct rq *this = this_rq();
- return this->cpu_load[0];
-}
-
-
-/*
- * Global load-average calculations
- *
- * We take a distributed and async approach to calculating the global load-avg
- * in order to minimize overhead.
- *
- * The global load average is an exponentially decaying average of nr_running +
- * nr_uninterruptible.
- *
- * Once every LOAD_FREQ:
- *
- * nr_active = 0;
- * for_each_possible_cpu(cpu)
- * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
- *
- * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
- *
- * Due to a number of reasons the above turns in the mess below:
- *
- * - for_each_possible_cpu() is prohibitively expensive on machines with
- * serious number of cpus, therefore we need to take a distributed approach
- * to calculating nr_active.
- *
- * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
- * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
- *
- * So assuming nr_active := 0 when we start out -- true per definition, we
- * can simply take per-cpu deltas and fold those into a global accumulate
- * to obtain the same result. See calc_load_fold_active().
- *
- * Furthermore, in order to avoid synchronizing all per-cpu delta folding
- * across the machine, we assume 10 ticks is sufficient time for every
- * cpu to have completed this task.
- *
- * This places an upper-bound on the IRQ-off latency of the machine. Then
- * again, being late doesn't loose the delta, just wrecks the sample.
- *
- * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- * this would add another cross-cpu cacheline miss and atomic operation
- * to the wakeup path. Instead we increment on whatever cpu the task ran
- * when it went into uninterruptible state and decrement on whatever cpu
- * did the wakeup. This means that only the sum of nr_uninterruptible over
- * all cpus yields the correct result.
- *
- * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
- */
-
-/* Variables and functions for calc_load */
-static atomic_long_t calc_load_tasks;
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun); /* should be removed */
-
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
-
-static long calc_load_fold_active(struct rq *this_rq)
-{
- long nr_active, delta = 0;
-
- nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
-
- if (nr_active != this_rq->calc_load_active) {
- delta = nr_active - this_rq->calc_load_active;
- this_rq->calc_load_active = nr_active;
- }
-
- return delta;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- */
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- load += 1UL << (FSHIFT - 1);
- return load >> FSHIFT;
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * Handle NO_HZ for the global load-average.
- *
- * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
- * NO_HZ.
- *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
- * when we read the global state.
- *
- * Obviously reality has to ruin such a delightfully simple scheme:
- *
- * - When we go NO_HZ idle during the window, we can negate our sample
- * contribution, causing under-accounting.
- *
- * We avoid this by keeping two idle-delta counters and flipping them
- * when the window starts, thus separating old and new NO_HZ load.
- *
- * The only trick is the slight shift in index flip for read vs write.
- *
- * 0s 5s 10s 15s
- * +10 +10 +10 +10
- * |-|-----------|-|-----------|-|-----------|-|
- * r:0 0 1 1 0 0 1 1 0
- * w:0 1 1 0 0 1 1 0 0
- *
- * This ensures we'll fold the old idle contribution in this window while
- * accumlating the new one.
- *
- * - When we wake up from NO_HZ idle during the window, we push up our
- * contribution, since we effectively move our sample point to a known
- * busy state.
- *
- * This is solved by pushing the window forward, and thus skipping the
- * sample, for this cpu (effectively using the idle-delta for this cpu which
- * was in effect at the time the window opened). This also solves the issue
- * of having to deal with a cpu having been in NOHZ idle for multiple
- * LOAD_FREQ intervals.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_idle[2];
-static int calc_load_idx;
-
-static inline int calc_load_write_idx(void)
-{
- int idx = calc_load_idx;
-
- /*
- * See calc_global_nohz(), if we observe the new index, we also
- * need to observe the new update time.
- */
- smp_rmb();
-
- /*
- * If the folding window started, make sure we start writing in the
- * next idle-delta.
- */
- if (!time_before(jiffies, calc_load_update))
- idx++;
-
- return idx & 1;
-}
-
-static inline int calc_load_read_idx(void)
-{
- return calc_load_idx & 1;
-}
-
-void calc_load_enter_idle(void)
-{
- struct rq *this_rq = this_rq();
- long delta;
-
- /*
- * We're going into NOHZ mode, if there's any pending delta, fold it
- * into the pending idle delta.
- */
- delta = calc_load_fold_active(this_rq);
- if (delta) {
- int idx = calc_load_write_idx();
- atomic_long_add(delta, &calc_load_idle[idx]);
- }
-}
-
-void calc_load_exit_idle(void)
-{
- struct rq *this_rq = this_rq();
-
- /*
- * If we're still before the sample window, we're done.
- */
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
-
- /*
- * We woke inside or after the sample window, this means we're already
- * accounted through the nohz accounting, so skip the entire deal and
- * sync up for the next window.
- */
- this_rq->calc_load_update = calc_load_update;
- if (time_before(jiffies, this_rq->calc_load_update + 10))
- this_rq->calc_load_update += LOAD_FREQ;
-}
-
-static long calc_load_fold_idle(void)
-{
- int idx = calc_load_read_idx();
- long delta = 0;
-
- if (atomic_long_read(&calc_load_idle[idx]))
- delta = atomic_long_xchg(&calc_load_idle[idx], 0);
-
- return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x: base of the power
- * @frac_bits: fractional bits of @x
- * @n: power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
- unsigned long result = 1UL << frac_bits;
-
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
- }
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
- }
-
- return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- * = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- * ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- * = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- * n 1 - x^(n+1)
- * S_n := \Sum x^i = -------------
- * i=0 1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
- unsigned long active, unsigned int n)
-{
-
- return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(void)
-{
- long delta, active, n;
-
- if (!time_before(jiffies, calc_load_update + 10)) {
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
- calc_load_update += n * LOAD_FREQ;
- }
-
- /*
- * Flip the idle index...
- *
- * Make sure we first write the new time then flip the index, so that
- * calc_load_write_idx() will see the new time when it reads the new
- * index, this avoids a double flip messing things up.
- */
- smp_wmb();
- calc_load_idx++;
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-
-static inline long calc_load_fold_idle(void) { return 0; }
-static inline void calc_global_nohz(void) { }
-
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
- long active, delta;
-
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Fold the 'old' idle-delta to include all NO_HZ cpus.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load(avenrun[0], EXP_1, active);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
- calc_load_update += LOAD_FREQ;
-
- /*
- * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
- */
- calc_global_nohz();
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
- long delta;
-
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
-
- delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long load = this_rq->load.weight;
- unsigned long pending_updates;
-
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (load || curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
- struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long pending_updates;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
- raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Called from scheduler_tick()
- */
-static void update_cpu_load_active(struct rq *this_rq)
-{
- /*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, this_rq->load.weight, 1);
-
- calc_load_account_active(this_rq);
-}
-
#ifdef CONFIG_SMP
/*
@@ -2651,7 +2186,7 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2686,7 +2221,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock_task - p->se.exec_start;
+ ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
@@ -2718,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns = 0;
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+ /*
+ * 64-bit doesn't need locks to atomically read a 64bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
+ *
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
+ * If we race with it entering cpu, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
+ */
+ if (!p->on_cpu)
+ return p->se.sum_exec_runtime;
+#endif
+
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, p, &flags);
@@ -2739,8 +2288,8 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
+ update_cpu_load_active(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2763,6 +2312,8 @@ void scheduler_tick(void)
* This makes sure that uptime, CFS vruntime, load
* balancing, etc... continue to move forward, even
* with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
*/
u64 scheduler_tick_max_deferment(void)
{
@@ -2791,7 +2342,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
-void __kprobes add_preempt_count(int val)
+void __kprobes preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2800,7 +2351,7 @@ void __kprobes add_preempt_count(int val)
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
- preempt_count() += val;
+ __preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
@@ -2811,9 +2362,9 @@ void __kprobes add_preempt_count(int val)
if (preempt_count() == val)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
-EXPORT_SYMBOL(add_preempt_count);
+EXPORT_SYMBOL(preempt_count_add);
-void __kprobes sub_preempt_count(int val)
+void __kprobes preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2831,9 +2382,9 @@ void __kprobes sub_preempt_count(int val)
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
- preempt_count() -= val;
+ __preempt_count_sub(val);
}
-EXPORT_SYMBOL(sub_preempt_count);
+EXPORT_SYMBOL(preempt_count_sub);
#endif
@@ -2966,6 +2517,12 @@ need_resched:
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
@@ -3000,6 +2557,7 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
rq->skip_clock_update = 0;
if (likely(prev != next)) {
@@ -3082,19 +2640,17 @@ void __sched schedule_preempt_disabled(void)
*/
asmlinkage void __sched notrace preempt_schedule(void)
{
- struct thread_info *ti = current_thread_info();
-
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
- if (likely(ti->preempt_count || irqs_disabled()))
+ if (likely(!preemptible()))
return;
do {
- add_preempt_count_notrace(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
__schedule();
- sub_preempt_count_notrace(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -3104,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
@@ -3113,20 +2670,19 @@ EXPORT_SYMBOL(preempt_schedule);
*/
asmlinkage void __sched preempt_schedule_irq(void)
{
- struct thread_info *ti = current_thread_info();
enum ctx_state prev_state;
/* Catch callers which need to be fixed */
- BUG_ON(ti->preempt_count || !irqs_disabled());
+ BUG_ON(preempt_count() || !irqs_disabled());
prev_state = exception_enter();
do {
- add_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
local_irq_enable();
__schedule();
local_irq_disable();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -3138,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
exception_exit(prev_state);
}
-#endif /* CONFIG_PREEMPT */
-
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
@@ -3147,393 +2701,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-/*
- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
- *
- * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
- * zero in this (rare) case, and we handle it by continuing to scan the queue.
- */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
-{
- wait_queue_t *curr, *next;
-
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
- break;
- }
-}
-
-/**
- * __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: is directly passed to the wakeup function
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(__wake_up);
-
-/*
- * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
- */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
-{
- __wake_up_common(q, mode, nr, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked);
-
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
-{
- __wake_up_common(q, mode, 1, 0, key);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-
-/**
- * __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: opaque value to be passed to wakeup targets
- *
- * The sync wakeup differs that the waker knows that it will schedule
- * away soon, so while the target thread will be woken up, it will not
- * be migrated to another CPU - ie. the two threads are 'synchronized'
- * with each other. This can prevent needless bouncing between CPUs.
- *
- * On UP it can prevent extra preemption.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
- int wake_flags = WF_SYNC;
-
- if (unlikely(!q))
- return;
-
- if (unlikely(!nr_exclusive))
- wake_flags = 0;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync_key);
-
-/*
- * __wake_up_sync - see __wake_up_sync_key()
- */
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
-{
- __wake_up_sync_key(q, mode, nr_exclusive, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
-
-/**
- * complete: - signals a single thread waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up a single thread waiting on this completion. Threads will be
- * awakened in the same order in which they were queued.
- *
- * See also complete_all(), wait_for_completion() and related routines.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete);
-
-/**
- * complete_all: - signals all threads waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up all threads waiting on this particular completion event.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete_all(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
- __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete_all);
-
-static inline long __sched
-do_wait_for_common(struct completion *x,
- long (*action)(long), long timeout, int state)
-{
- if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
-
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
- do {
- if (signal_pending_state(state, current)) {
- timeout = -ERESTARTSYS;
- break;
- }
- __set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
- timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
- } while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
- if (!x->done)
- return timeout;
- }
- x->done--;
- return timeout ?: 1;
-}
-
-static inline long __sched
-__wait_for_common(struct completion *x,
- long (*action)(long), long timeout, int state)
-{
- might_sleep();
-
- spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
- return timeout;
-}
-
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
-{
- return __wait_for_common(x, schedule_timeout, timeout, state);
-}
-
-static long __sched
-wait_for_common_io(struct completion *x, long timeout, int state)
-{
- return __wait_for_common(x, io_schedule_timeout, timeout, state);
-}
-
-/**
- * wait_for_completion: - waits for completion of a task
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout.
- *
- * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
- * and interrupt capability. Also see complete().
- */
-void __sched wait_for_completion(struct completion *x)
-{
- wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion);
-
-/**
- * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible.
- *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_timeout(struct completion *x, unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_timeout);
-
-/**
- * wait_for_completion_io: - waits for completion of a task
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
- */
-void __sched wait_for_completion_io(struct completion *x)
-{
- wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_io);
-
-/**
- * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
- *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
-{
- return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_io_timeout);
-
-/**
- * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
- * @x: holds the state of this particular completion
- *
- * This waits for completion of a specific task to be signaled. It is
- * interruptible.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_interruptible(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible);
-
-/**
- * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. It is interruptible. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_interruptible_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
-
-/**
- * wait_for_completion_killable: - waits for completion of a task (killable)
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It can be
- * interrupted by a kill signal.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_killable(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_killable);
-
-/**
- * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be
- * signaled or for a specified timeout to expire. It can be
- * interrupted by a kill signal. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_killable_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(wait_for_completion_killable_timeout);
-
-/**
- * try_wait_for_completion - try to decrement a completion without blocking
- * @x: completion structure
- *
- * Returns: 0 if a decrement cannot be done without blocking
- * 1 if a decrement succeeded.
- *
- * If a completion is being used as a counting completion,
- * attempt to decrement the counter without blocking. This
- * enables us to avoid waiting if the resource the completion
- * is protecting is not available.
- */
-bool try_wait_for_completion(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- else
- x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(try_wait_for_completion);
-
-/**
- * completion_done - Test to see if a completion has any waiters
- * @x: completion structure
- *
- * Returns: 0 if there are waiters (wait_for_completion() in progress)
- * 1 if there are no waiters.
- *
- */
-bool completion_done(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(completion_done);
-
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
@@ -3754,7 +2921,7 @@ SYSCALL_DEFINE1(nice, int, increment)
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
@@ -3766,6 +2933,8 @@ int task_prio(const struct task_struct *p)
/**
* task_nice - return the nice value of a given task.
* @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
*/
int task_nice(const struct task_struct *p)
{
@@ -3776,6 +2945,8 @@ EXPORT_SYMBOL(task_nice);
/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int idle_cpu(int cpu)
{
@@ -3798,6 +2969,8 @@ int idle_cpu(int cpu)
/**
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -3807,6 +2980,8 @@ struct task_struct *idle_task(int cpu)
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
*/
static struct task_struct *find_process_by_pid(pid_t pid)
{
@@ -4004,6 +3179,8 @@ recheck:
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Return: 0 on success. An error code otherwise.
+ *
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
@@ -4023,6 +3200,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* current context has permission. For example, this is needed in
* stop_machine(): we create temporary high priority worker threads,
* but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
@@ -4057,6 +3236,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
* @pid: the pid in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
struct sched_param __user *, param)
@@ -4072,6 +3253,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
* sys_sched_setparam - set/change the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -4081,6 +3264,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
*/
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
@@ -4107,6 +3293,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
* sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -4148,13 +3337,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
struct task_struct *p;
int retval;
- get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
- put_online_cpus();
return -ESRCH;
}
@@ -4211,7 +3398,6 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
- put_online_cpus();
return retval;
}
@@ -4231,6 +3417,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4254,7 +3442,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
- get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -4267,12 +3454,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
- put_online_cpus();
return retval;
}
@@ -4282,6 +3468,8 @@ out_unlock:
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4316,6 +3504,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*
* This function yields the current CPU to other tasks. If there are no
* other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
*/
SYSCALL_DEFINE0(sched_yield)
{
@@ -4338,16 +3528,11 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-static inline int should_resched(void)
-{
- return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
static void __cond_resched(void)
{
- add_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
__schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
@@ -4441,7 +3626,7 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns:
+ * Return:
* true (>0) if we indeed boosted the target task.
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
@@ -4544,8 +3729,9 @@ long __sched io_schedule_timeout(long timeout)
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
@@ -4569,8 +3755,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
* sys_sched_get_priority_min - return minimum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
@@ -4596,6 +3783,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
*
* this syscall writes the default timeslice value of a given process
* into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
*/
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
@@ -4705,7 +3895,7 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+void init_idle_bootup_task(struct task_struct *idle)
{
idle->sched_class = &idle_sched_class;
}
@@ -4718,14 +3908,14 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
- __sched_fork(idle);
+ __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -4751,7 +3941,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
- task_thread_info(idle)->preempt_count = 0;
+ init_idle_preempt_count(idle, cpu);
/*
* The idle tasks have their own, simple scheduling class:
@@ -4885,6 +4075,53 @@ fail:
return ret;
}
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+ struct migration_arg arg = { p, target_cpu };
+ int curr_cpu = task_cpu(p);
+
+ if (curr_cpu == target_cpu)
+ return 0;
+
+ if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ return -EINVAL;
+
+ /* TODO: This is not properly updating schedstats */
+
+ return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+ struct rq *rq;
+ unsigned long flags;
+ bool on_rq, running;
+
+ rq = task_rq_lock(p, &flags);
+ on_rq = p->on_rq;
+ running = task_current(rq, p);
+
+ if (on_rq)
+ dequeue_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+
+ p->numa_preferred_nid = nid;
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq)
+ enqueue_task(rq, p, 0);
+ task_rq_unlock(rq, p, &flags);
+}
+#endif
+
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
@@ -4960,6 +4197,13 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
rq->stop = NULL;
+ /*
+ * put_prev_task() and pick_next_task() sched
+ * class method both need to have an up-to-date
+ * value of rq->clock[_task]
+ */
+ update_rq_clock(rq);
+
for ( ; ; ) {
/*
* There's this thread running, bail when that's the only
@@ -5093,7 +4337,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
return table;
}
-static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
struct ctl_table *entry, *table;
struct sched_domain *sd;
@@ -5195,7 +4439,7 @@ static void set_rq_offline(struct rq *rq)
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
-static int __cpuinit
+static int
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
@@ -5249,12 +4493,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
* happens before everything else. This has to be lower priority than
* the notifier in the perf_event subsystem, though.
*/
-static struct notifier_block __cpuinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
.notifier_call = migration_call,
.priority = CPU_PRI_MIGRATION,
};
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
@@ -5267,7 +4511,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
}
}
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
@@ -5479,7 +4723,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES);
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5516,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rt yet then
+ * If we dont want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -5648,19 +4893,35 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
* two cpus are in the same cache domain, see cpus_share_cache().
*/
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
int id = cpu;
+ int size = 1;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ sd = sd->parent; /* sd_busy */
+ }
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
@@ -5683,6 +4944,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
@@ -5907,7 +5175,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
get_group(cpu, sdd, &sd->groups);
atomic_inc(&sd->groups->ref);
- if (cpu != cpumask_first(sched_domain_span(sd)))
+ if (cpu != cpumask_first(span))
return 0;
lockdep_assert_held(&sched_domains_mutex);
@@ -5917,12 +5185,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
for_each_cpu(i, span) {
struct sched_group *sg;
- int group = get_group(i, sdd, &sg);
- int j;
+ int group, j;
if (cpumask_test_cpu(i, covered))
continue;
+ group = get_group(i, sdd, &sg);
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
cpumask_setall(sched_group_mask(sg));
@@ -5960,7 +5228,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
- WARN_ON(!sd || !sg);
+ WARN_ON(!sg);
do {
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -6125,6 +5393,9 @@ static struct sched_domain_topology_level default_topology[] = {
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->init; tl++)
+
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
@@ -6170,6 +5441,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_SHARE_PKG_RESOURCES
| 1*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
+ | 1*SD_NUMA
| sd_local_flags(level)
,
.last_balance = jiffies,
@@ -6422,7 +5694,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
struct sched_domain_topology_level *tl;
int j;
- for (tl = sched_domain_topology; tl->init; tl++) {
+ for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6475,7 +5747,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sched_domain_topology_level *tl;
int j;
- for (tl = sched_domain_topology; tl->init; tl++) {
+ for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
@@ -6503,9 +5775,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
- struct s_data *d, const struct cpumask *cpu_map,
- struct sched_domain_attr *attr, struct sched_domain *child,
- int cpu)
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
{
struct sched_domain *sd = tl->init(tl, cpu);
if (!sd)
@@ -6516,8 +5787,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
+ sd->child = child;
}
- sd->child = child;
set_domain_attribute(sd, attr);
return sd;
@@ -6530,7 +5801,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
- enum s_alloc alloc_state = sa_none;
+ enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
int i, ret = -ENOMEM;
@@ -6544,18 +5815,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_topology_level *tl;
sd = NULL;
- for (tl = sched_domain_topology; tl->init; tl++) {
- sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
-
- while (sd->child)
- sd = sd->child;
-
- *per_cpu_ptr(d.sd, i) = sd;
}
/* Build the groups for the domains */
@@ -6750,8 +6018,9 @@ match1:
;
}
+ n = ndoms_cur;
if (doms_new == NULL) {
- ndoms_cur = 0;
+ n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
WARN_ON_ONCE(dattr_new);
@@ -6759,7 +6028,7 @@ match1:
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < ndoms_cur && !new_topology; j++) {
+ for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
@@ -6854,22 +6123,22 @@ void __init sched_init_smp(void)
sched_init_numa();
- get_online_cpus();
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * cpu masks are stable and all blatant races in the below code cannot
+ * happen.
+ */
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- put_online_cpus();
hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
- /* RT runtime code needs to handle some hotplug events */
- hotcpu_notifier(update_runtime, 0);
-
init_hrtick();
/* Move init over to a non-isolated CPU */
@@ -7027,6 +6296,7 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
+ rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7201,6 +6471,8 @@ void normalize_rt_tasks(void)
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
*/
struct task_struct *curr_task(int cpu)
{
@@ -7332,7 +6604,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
lockdep_is_held(&tsk->sighand->siglock)),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
@@ -7654,23 +6926,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
#ifdef CONFIG_CGROUP_SCHED
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
- struct task_group, css);
+ return css ? container_of(css, struct task_group, css) : NULL;
}
-static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct task_group *tg, *parent;
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
- if (!cgrp->parent) {
+ if (!parent) {
/* This is early initialization for the top cgroup */
return &root_task_group.css;
}
- parent = cgroup_tg(cgrp->parent);
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
@@ -7678,41 +6949,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
return &tg->css;
}
-static int cpu_cgroup_css_online(struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
- struct task_group *parent;
-
- if (!cgrp->parent)
- return 0;
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css_parent(css));
- parent = cgroup_tg(cgrp->parent);
- sched_online_group(tg, parent);
+ if (parent)
+ sched_online_group(tg, parent);
return 0;
}
-static void cpu_cgroup_css_free(struct cgroup *cgrp)
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_destroy_group(tg);
}
-static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_offline_group(tg);
}
-static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
@@ -7723,18 +6991,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
return 0;
}
-static void cpu_cgroup_attach(struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
}
-static void
-cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
- struct task_struct *task)
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
@@ -7748,15 +7016,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 shareval)
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
return (u64) scale_load_down(tg->shares);
}
@@ -7800,7 +7069,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
- account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+ /*
+ * If we need to toggle cfs_bandwidth_used, off->on must occur
+ * before making related changes, and on->off must occur afterwards
+ */
+ if (runtime_enabled && !runtime_was_enabled)
+ cfs_bandwidth_usage_inc();
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7826,6 +7100,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
+ if (runtime_was_enabled && !runtime_enabled)
+ cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
@@ -7878,26 +7154,28 @@ long tg_get_cfs_period(struct task_group *tg)
return cfs_period_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_quota(cgroup_tg(cgrp));
+ return tg_get_cfs_quota(css_tg(css));
}
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
- s64 cfs_quota_us)
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 cfs_quota_us)
{
- return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+ return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_period(cgroup_tg(cgrp));
+ return tg_get_cfs_period(css_tg(css));
}
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 cfs_period_us)
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_period_us)
{
- return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+ return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
struct cfs_schedulable_data {
@@ -7978,10 +7256,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7994,26 +7272,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
- s64 val)
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
{
- return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ return sched_group_set_rt_runtime(css_tg(css), val);
}
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_runtime(cgroup_tg(cgrp));
+ return sched_group_rt_runtime(css_tg(css));
}
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_period_us)
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 rt_period_us)
{
- return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+ return sched_group_set_rt_period(css_tg(css), rt_period_us);
}
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_period(cgroup_tg(cgrp));
+ return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2c..f64722f 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
struct kernel_cpustat __percpu *cpustat;
};
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
+ return css ? container_of(css, struct cpuacct, css) : NULL;
}
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
-{
- return cgroup_ca(ca->css.cgroup->parent);
+ return css_ca(task_css(tsk, cpuacct_subsys_id));
}
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
{
- if (!ca->css.cgroup->parent)
- return NULL;
- return cgroup_ca(ca->css.cgroup->parent);
+ return css_ca(css_parent(&ca->css));
}
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
};
/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuacct *ca;
- if (!cgrp->parent)
+ if (!parent_css)
return &root_cpuacct.css;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
}
/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
+static void cpuacct_css_free(struct cgroup_subsys_state *css)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
free_percpu(ca->cpustat);
free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
}
/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
int i;
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
return totalcpuusage;
}
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
+static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 reset)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
int err = 0;
int i;
@@ -172,10 +163,10 @@ out:
return err;
}
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
+static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *m)
{
- struct cpuacct *ca = cgroup_ca(cgroup);
+ struct cpuacct *ca = css_ca(css);
u64 percpu;
int i;
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
+static int cpuacct_stats_show(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct cgroup_map_cb *cb)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
int cpu;
s64 val = 0;
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
while (ca != &root_cpuacct) {
kcpustat = this_cpu_ptr(ca->cpustat);
kcpustat->cpustat[index] += val;
- ca = __parent_ca(ca);
+ ca = parent_ca(ca);
}
rcu_read_unlock();
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e87..8b836b3 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
- * Returns: (int)bool - CPUs were found
+ * Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
*
- * Returns: -ENOMEM if memory fails.
+ * Return: -ENOMEM on memory allocation failure.
*/
int cpupri_init(struct cpupri *cp)
{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b5ccba2..9994791 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
* is the only cgroup, then nothing else should be necessary.
*
*/
- __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+ __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
cpuacct_account_field(p, index, tmp);
}
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
+void vtime_common_task_switch(struct task_struct *prev)
{
- if (!vtime_accounting_enabled())
- return;
-
if (is_idle_task(prev))
vtime_account_idle(prev);
else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_common_account_irq_enter(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
if (!in_interrupt()) {
/*
* If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
}
vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
@@ -515,9 +509,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
for (;;) {
/* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime) {
- u64 tmp = rtime; rtime = stime; stime = tmp;
- }
+ if (stime > rtime)
+ swap(rtime, stime);
/* Make sure 'total' fits in 32 bits */
if (total >> 32)
@@ -558,16 +551,7 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, stime, utime, total;
-
- if (vtime_accounting_enabled()) {
- *ut = curr->utime;
- *st = curr->stime;
- return;
- }
-
- stime = curr->stime;
- total = stime + curr->utime;
+ cputime_t rtime, stime, utime;
/*
* Tick based cputime accounting depend on random scheduling
@@ -589,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr,
if (prev->stime + prev->utime >= rtime)
goto out;
- if (total) {
+ stime = curr->stime;
+ utime = curr->utime;
+
+ if (utime == 0) {
+ stime = rtime;
+ } else if (stime == 0) {
+ utime = rtime;
+ } else {
+ cputime_t total = stime + utime;
+
stime = scale_stime((__force u64)stime,
(__force u64)rtime, (__force u64)total);
utime = rtime - stime;
- } else {
- stime = rtime;
- utime = 0;
}
/*
@@ -665,23 +655,17 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
-void vtime_account_irq_exit(struct task_struct *tsk)
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- __vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
@@ -689,12 +673,8 @@ void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- if (!vtime_accounting_enabled())
- return;
-
- delta_cpu = get_vtime_delta(tsk);
-
write_seqlock(&tsk->vtime_seqlock);
+ delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
write_sequnlock(&tsk->vtime_seqlock);
@@ -702,22 +682,27 @@ void vtime_account_user(struct task_struct *tsk)
void vtime_user_enter(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
- tsk->vtime_snap_whence = VTIME_USER;
__vtime_account_system(tsk);
+ tsk->vtime_snap_whence = VTIME_USER;
write_sequnlock(&tsk->vtime_seqlock);
}
void vtime_guest_enter(struct task_struct *tsk)
{
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_snap flush and update.
+ * That enforces a right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
@@ -726,6 +711,7 @@ void vtime_guest_exit(struct task_struct *tsk)
current->flags &= ~PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
@@ -734,11 +720,6 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(delta_cpu);
}
-bool vtime_accounting_enabled(void)
-{
- return context_tracking_active();
-}
-
void arch_vtime_task_switch(struct task_struct *prev)
{
write_seqlock(&prev->vtime_seqlock);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a6..5c34d18 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+#include <linux/mempolicy.h>
#include "sched.h"
@@ -124,7 +125,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, " ");
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
- p->comm, p->pid,
+ p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
+#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
- if (!p->on_rq || task_cpu(p) != rq_cpu)
+ if (task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
@@ -209,22 +213,32 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
+ SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
- SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
- (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
- SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic_long_read(&cfs_rq->tg->load_avg));
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
atomic_read(&cfs_rq->tg->runnable_avg));
#endif
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
+ cfs_rq->tg->cfs_bandwidth.timer_active);
+ SEQ_printf(m, " .%-30s: %d\n", "throttled",
+ cfs_rq->throttled);
+ SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
+ cfs_rq->throttle_count);
+#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
@@ -287,7 +301,7 @@ do { \
P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
- P(curr->pid);
+ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
P(cpu_load[0]);
P(cpu_load[1]);
@@ -343,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
cpu_clk = local_clock();
local_irq_restore(flags);
- SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
+ SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@@ -486,22 +500,73 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
+#define __P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+
+static void sched_show_numa(struct task_struct *p, struct seq_file *m)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ struct mempolicy *pol;
+ int node, i;
+
+ if (p->mm)
+ P(mm->numa_scan_seq);
+
+ task_lock(p);
+ pol = p->mempolicy;
+ if (pol && !(pol->flags & MPOL_F_MORON))
+ pol = NULL;
+ mpol_get(pol);
+ task_unlock(p);
+
+ SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
+
+ for_each_online_node(node) {
+ for (i = 0; i < 2; i++) {
+ unsigned long nr_faults = -1;
+ int cpu_current, home_node;
+
+ if (p->numa_faults)
+ nr_faults = p->numa_faults[2*node + i];
+
+ cpu_current = !i ? (task_node(p) == node) :
+ (pol && node_isset(node, pol->v.nodes));
+
+ home_node = (p->numa_preferred_nid == node);
+
+ SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+ i, node, cpu_current, home_node, nr_faults);
+ }
+ }
+
+ mpol_put(pol);
+#endif
+}
+
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
get_nr_threads(p));
SEQ_printf(m,
- "---------------------------------------------------------\n");
+ "---------------------------------------------------------"
+ "----------\n");
#define __P(F) \
- SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
- SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) \
- SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
- SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
PN(se.exec_start);
PN(se.vruntime);
@@ -560,12 +625,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
}
#endif
__P(nr_switches);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
+#ifdef CONFIG_SMP
+ P(se.avg.runnable_avg_sum);
+ P(se.avg.runnable_avg_period);
+ P(se.avg.load_avg_contrib);
+ P(se.avg.decay_count);
+#endif
P(policy);
P(prio);
#undef PN
@@ -579,9 +650,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
t0 = cpu_clock(this_cpu);
t1 = cpu_clock(this_cpu);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
+
+ sched_show_numa(p, m);
}
void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614..fd773ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
@@ -662,6 +680,28 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
+#ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
+static inline void __update_task_entity_contrib(struct sched_entity *se);
+
+/* Give new task start runnable values to heavy its load in infant time */
+void init_task_runnable_average(struct task_struct *p)
+{
+ u32 slice;
+
+ p->se.avg.decay_count = 0;
+ slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
+ p->se.avg.runnable_avg_sum = slice;
+ p->se.avg.runnable_avg_period = slice;
+ __update_task_entity_contrib(&p->se);
+}
+#else
+void init_task_runnable_average(struct task_struct *p)
+{
+}
+#endif
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -686,7 +726,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock_task;
+ u64 now = rq_clock_task(rq_of(cfs_rq));
unsigned long delta_exec;
if (unlikely(!curr))
@@ -718,7 +758,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
/*
@@ -738,14 +778,14 @@ static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
}
#endif
schedstat_set(se->statistics.wait_start, 0);
@@ -771,7 +811,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock_task;
+ se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/**************************************************
@@ -780,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_NUMA_BALANCING
/*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the tasks virtual memory size and
+ * numa_balancing_scan_size.
*/
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -792,41 +833,835 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
-static void task_numa_placement(struct task_struct *p)
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+ unsigned long rss = 0;
+ unsigned long nr_scan_pages;
+
+ /*
+ * Calculations based on RSS as non-present and empty pages are skipped
+ * by the PTE scanner and NUMA hinting faults should be trapped based
+ * on resident pages
+ */
+ nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ rss = get_mm_rss(p->mm);
+ if (!rss)
+ rss = nr_scan_pages;
+
+ rss = round_up(rss, nr_scan_pages);
+ return rss / nr_scan_pages;
+}
+
+/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+ unsigned int scan, floor;
+ unsigned int windows = 1;
+
+ if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ floor = 1000 / windows;
+
+ scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+ return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+ unsigned int smin = task_scan_min(p);
+ unsigned int smax;
+
+ /* Watch for min being lower than max due to floor calculations */
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+ return max(smin, smax);
+}
+
+/*
+ * Once a preferred node is selected the scheduler balancer will prefer moving
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
+ * scans. This will give the process the chance to accumulate more faults on
+ * the preferred node but still allow the scheduler to move the task again if
+ * the nodes CPUs are overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ struct list_head task_list;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long faults[0];
+};
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+ return p->numa_group ? p->numa_group->gid : 0;
+}
+
+static inline int task_faults_idx(int nid, int priv)
+{
+ return 2 * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_faults)
+ return 0;
+
+ return p->numa_faults[task_faults_idx(nid, 0)] +
+ p->numa_faults[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_group)
+ return 0;
+
+ return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+ unsigned long total_faults;
+
+ if (!p->numa_faults)
+ return 0;
+
+ total_faults = p->total_numa_faults;
+
+ if (!total_faults)
+ return 0;
+
+ return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+ if (!p->numa_group || !p->numa_group->total_faults)
+ return 0;
+
+ return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+ unsigned long nr_running;
+ unsigned long load;
+
+ /* Total compute capacity of CPUs on a node */
+ unsigned long power;
+
+ /* Approximate capacity in terms of runnable tasks on a node */
+ unsigned long capacity;
+ int has_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+ int cpu, cpus = 0;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->nr_running += rq->nr_running;
+ ns->load += weighted_cpuload(cpu);
+ ns->power += power_of(cpu);
+
+ cpus++;
+ }
+
+ /*
+ * If we raced with hotplug and there are no CPUs left in our mask
+ * the @ns structure is NULL'ed and task_numa_compare() will
+ * not find this node attractive.
+ *
+ * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+ * and bail there.
+ */
+ if (!cpus)
+ return;
+
+ ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+ ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+ ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
+struct task_numa_env {
+ struct task_struct *p;
+
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
+
+ struct numa_stats src_stats, dst_stats;
+
+ int imbalance_pct, idx;
+
+ struct task_struct *best_task;
+ long best_imp;
+ int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
+{
+ if (env->best_task)
+ put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
+
+ env->best_task = p;
+ env->best_imp = imp;
+ env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source tasks was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ struct rq *src_rq = cpu_rq(env->src_cpu);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ struct task_struct *cur;
+ long dst_load, src_load;
+ long load;
+ long imp = (groupimp > 0) ? groupimp : taskimp;
+
+ rcu_read_lock();
+ cur = ACCESS_ONCE(dst_rq->curr);
+ if (cur->pid == 0) /* idle */
+ cur = NULL;
+
+ /*
+ * "imp" is the fault differential for the source task between the
+ * source and destination node. Calculate the total differential for
+ * the source task and potential destination task. The more negative
+ * the value is, the more rmeote accesses that would be expected to
+ * be incurred if the tasks were swapped.
+ */
+ if (cur) {
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ goto unlock;
+
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+ if (cur->numa_group)
+ imp -= imp/16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by
+ * itself (not part of a group), use the task weight
+ * instead.
+ */
+ if (env->p->numa_group)
+ imp = groupimp;
+ else
+ imp = taskimp;
+
+ if (cur->numa_group)
+ imp += group_weight(cur, env->src_nid) -
+ group_weight(cur, env->dst_nid);
+ else
+ imp += task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ }
+ }
+
+ if (imp < env->best_imp)
+ goto unlock;
+
+ if (!cur) {
+ /* Is there capacity at our destination? */
+ if (env->src_stats.has_capacity &&
+ !env->dst_stats.has_capacity)
+ goto unlock;
+
+ goto balance;
+ }
+
+ /* Balance doesn't matter much if we're running a task per cpu */
+ if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ goto assign;
+
+ /*
+ * In the overloaded case, try and keep the load balanced.
+ */
+balance:
+ dst_load = env->dst_stats.load;
+ src_load = env->src_stats.load;
+
+ /* XXX missing power terms */
+ load = task_h_load(env->p);
+ dst_load += load;
+ src_load -= load;
+
+ if (cur) {
+ load = task_h_load(cur);
+ dst_load -= load;
+ src_load += load;
+ }
+
+ /* make src_load the smaller */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ if (src_load * env->imbalance_pct < dst_load * 100)
+ goto unlock;
+
+assign:
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ continue;
+
+ env->dst_cpu = cpu;
+ task_numa_compare(env, taskimp, groupimp);
+ }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ struct task_numa_env env = {
+ .p = p,
+
+ .src_cpu = task_cpu(p),
+ .src_nid = task_node(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1
+ };
+ struct sched_domain *sd;
+ unsigned long taskweight, groupweight;
+ int nid, ret;
+ long taskimp, groupimp;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * And we want to avoid any moving of tasks about, as that would create
+ * random movement of tasks -- counter the numa conditions we're trying
+ * to satisfy here.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ if (sd)
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ rcu_read_unlock();
+
+ /*
+ * Cpusets can break the scheduler domain tree into smaller
+ * balance domains, some of which do not cross NUMA boundaries.
+ * Tasks that are "trapped" in such domains cannot be migrated
+ * elsewhere, so there is no point in (re)trying.
+ */
+ if (unlikely(!sd)) {
+ p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+ return -EINVAL;
+ }
+
+ taskweight = task_weight(p, env.src_nid);
+ groupweight = group_weight(p, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ env.dst_nid = p->numa_preferred_nid;
+ taskimp = task_weight(p, env.dst_nid) - taskweight;
+ groupimp = group_weight(p, env.dst_nid) - groupweight;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+
+ /* If the preferred nid has capacity, try to use it. */
+ if (env.dst_stats.has_capacity)
+ task_numa_find_cpu(&env, taskimp, groupimp);
+
+ /* No space available on the preferred nid. Look elsewhere. */
+ if (env.best_cpu == -1) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+
+ /* Only consider nodes where both task and groups benefit */
+ taskimp = task_weight(p, nid) - taskweight;
+ groupimp = group_weight(p, nid) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
+ continue;
+
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ task_numa_find_cpu(&env, taskimp, groupimp);
+ }
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
+ sched_setnuma(p, env.dst_nid);
+
+ /*
+ * Reset the scan period if the task is being rescheduled on an
+ * alternative node to recheck if the tasks is now properly placed.
+ */
+ p->numa_scan_period = task_scan_min(p);
+
+ if (env.best_task == NULL) {
+ int ret = migrate_task_to(p, env.best_cpu);
+ return ret;
+ }
+
+ ret = migrate_swap(p, env.best_task);
+ put_task_struct(env.best_task);
+ return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+ /* This task has no NUMA fault statistics yet */
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ return;
+
+ /* Periodically retry migrating the task to the preferred node */
+ p->numa_migrate_retry = jiffies + HZ;
+
+ /* Success if task is already running on preferred CPU */
+ if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
+ return;
+
+ /* Otherwise, try migrate to a CPU on the preferred node */
+ task_numa_migrate(p);
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
{
- int seq;
+ unsigned int period_slot;
+ int ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no record hinting faults then either the task is
+ * completely idle or all activity is areas that are not of interest
+ * to automatic numa balancing. Scan slower
+ */
+ if (local + shared == 0) {
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period << 1);
+
+ p->mm->numa_next_scan = jiffies +
+ msecs_to_jiffies(p->numa_scan_period);
- if (!p->mm) /* for example, ksmd faulting in a user's mm */
return;
+ }
+
+ /*
+ * Prepare to scale scan period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+ */
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+ ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ if (ratio >= NUMA_PERIOD_THRESHOLD) {
+ int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+ /*
+ * Scale scan rate increases based on sharing. There is an
+ * inverse relationship between the degree of sharing and
+ * the adjustment made to the scanning period. Broadly
+ * speaking the intent is that there is little point
+ * scanning faster if shared accesses dominate as it may
+ * simply bounce migrations uselessly
+ */
+ period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ }
+
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
+ task_scan_min(p), task_scan_max(p));
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq, nid, max_nid = -1, max_group_nid = -1;
+ unsigned long max_faults = 0, max_group_faults = 0;
+ unsigned long fault_types[2] = { 0, 0 };
+ spinlock_t *group_lock = NULL;
+
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
+
+ /* If the task is part of a group prevent parallel updates to group stats */
+ if (p->numa_group) {
+ group_lock = &p->numa_group->lock;
+ spin_lock(group_lock);
+ }
+
+ /* Find the node with the highest number of faults */
+ for_each_online_node(nid) {
+ unsigned long faults = 0, group_faults = 0;
+ int priv, i;
+
+ for (priv = 0; priv < 2; priv++) {
+ long diff;
+
+ i = task_faults_idx(nid, priv);
+ diff = -p->numa_faults[i];
+
+ /* Decay existing window, copy faults since last scan */
+ p->numa_faults[i] >>= 1;
+ p->numa_faults[i] += p->numa_faults_buffer[i];
+ fault_types[priv] += p->numa_faults_buffer[i];
+ p->numa_faults_buffer[i] = 0;
+
+ faults += p->numa_faults[i];
+ diff += p->numa_faults[i];
+ p->total_numa_faults += diff;
+ if (p->numa_group) {
+ /* safe because we can only change our own group */
+ p->numa_group->faults[i] += diff;
+ p->numa_group->total_faults += diff;
+ group_faults += p->numa_group->faults[i];
+ }
+ }
+
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+ }
+
+ if (group_faults > max_group_faults) {
+ max_group_faults = group_faults;
+ max_group_nid = nid;
+ }
+ }
+
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
+
+ if (p->numa_group) {
+ /*
+ * If the preferred task and group nids are different,
+ * iterate over the nodes again to find the best place.
+ */
+ if (max_nid != max_group_nid) {
+ unsigned long weight, max_weight = 0;
+
+ for_each_online_node(nid) {
+ weight = task_weight(p, nid) + group_weight(p, nid);
+ if (weight > max_weight) {
+ max_weight = weight;
+ max_nid = nid;
+ }
+ }
+ }
+
+ spin_unlock(group_lock);
+ }
+
+ /* Preferred node as the node with the most faults */
+ if (max_faults && max_nid != p->numa_preferred_nid) {
+ /* Update the preferred nid and migrate task if possible */
+ sched_setnuma(p, max_nid);
+ numa_migrate_preferred(p);
+ }
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+ return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+ if (atomic_dec_and_test(&grp->refcount))
+ kfree_rcu(grp, rcu);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int *priv)
+{
+ struct numa_group *grp, *my_grp;
+ struct task_struct *tsk;
+ bool join = false;
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+ if (unlikely(!p->numa_group)) {
+ unsigned int size = sizeof(struct numa_group) +
+ 2*nr_node_ids*sizeof(unsigned long);
- /* FIXME: Scheduling placement policy hints go here */
+ grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!grp)
+ return;
+
+ atomic_set(&grp->refcount, 1);
+ spin_lock_init(&grp->lock);
+ INIT_LIST_HEAD(&grp->task_list);
+ grp->gid = p->pid;
+
+ for (i = 0; i < 2*nr_node_ids; i++)
+ grp->faults[i] = p->numa_faults[i];
+
+ grp->total_faults = p->total_numa_faults;
+
+ list_add(&p->numa_entry, &grp->task_list);
+ grp->nr_tasks++;
+ rcu_assign_pointer(p->numa_group, grp);
+ }
+
+ rcu_read_lock();
+ tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+ if (!cpupid_match_pid(tsk, cpupid))
+ goto no_join;
+
+ grp = rcu_dereference(tsk->numa_group);
+ if (!grp)
+ goto no_join;
+
+ my_grp = p->numa_group;
+ if (grp == my_grp)
+ goto no_join;
+
+ /*
+ * Only join the other group if its bigger; if we're the bigger group,
+ * the other task will join us.
+ */
+ if (my_grp->nr_tasks > grp->nr_tasks)
+ goto no_join;
+
+ /*
+ * Tie-break on the grp address.
+ */
+ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+ goto no_join;
+
+ /* Always join threads in the same process. */
+ if (tsk->mm == current->mm)
+ join = true;
+
+ /* Simple filter to avoid false positives due to PID collisions */
+ if (flags & TNF_SHARED)
+ join = true;
+
+ /* Update priv based on whether false sharing was detected */
+ *priv = !join;
+
+ if (join && !get_numa_group(grp))
+ goto no_join;
+
+ rcu_read_unlock();
+
+ if (!join)
+ return;
+
+ double_lock(&my_grp->lock, &grp->lock);
+
+ for (i = 0; i < 2*nr_node_ids; i++) {
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
+ }
+ my_grp->total_faults -= p->total_numa_faults;
+ grp->total_faults += p->total_numa_faults;
+
+ list_move(&p->numa_entry, &grp->task_list);
+ my_grp->nr_tasks--;
+ grp->nr_tasks++;
+
+ spin_unlock(&my_grp->lock);
+ spin_unlock(&grp->lock);
+
+ rcu_assign_pointer(p->numa_group, grp);
+
+ put_numa_group(my_grp);
+ return;
+
+no_join:
+ rcu_read_unlock();
+ return;
+}
+
+void task_numa_free(struct task_struct *p)
+{
+ struct numa_group *grp = p->numa_group;
+ int i;
+ void *numa_faults = p->numa_faults;
+
+ if (grp) {
+ spin_lock(&grp->lock);
+ for (i = 0; i < 2*nr_node_ids; i++)
+ grp->faults[i] -= p->numa_faults[i];
+ grp->total_faults -= p->total_numa_faults;
+
+ list_del(&p->numa_entry);
+ grp->nr_tasks--;
+ spin_unlock(&grp->lock);
+ rcu_assign_pointer(p->numa_group, NULL);
+ put_numa_group(grp);
+ }
+
+ p->numa_faults = NULL;
+ p->numa_faults_buffer = NULL;
+ kfree(numa_faults);
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int node, int pages, bool migrated)
+void task_numa_fault(int last_cpupid, int node, int pages, int flags)
{
struct task_struct *p = current;
+ bool migrated = flags & TNF_MIGRATED;
+ int priv;
- if (!sched_feat_numa(NUMA))
+ if (!numabalancing_enabled)
return;
- /* FIXME: Allocate task-specific structure for placement policy here */
+ /* for example, ksmd faulting in a user's mm */
+ if (!p->mm)
+ return;
+
+ /* Do not worry about placement if exiting */
+ if (p->state == TASK_DEAD)
+ return;
+
+ /* Allocate buffer to track faults on a per-node basis */
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+
+ /* numa_faults and numa_faults_buffer share the allocation */
+ p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
+ return;
+
+ BUG_ON(p->numa_faults_buffer);
+ p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+ p->total_numa_faults = 0;
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+ }
/*
- * If pages are properly placed (did not migrate) then scan slower.
- * This is reset periodically in case of phase changes
+ * First accesses are treated as private, otherwise consider accesses
+ * to be private if the accessing pid has not changed
*/
- if (!migrated)
- p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
- p->numa_scan_period + jiffies_to_msecs(10));
+ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+ priv = 1;
+ } else {
+ priv = cpupid_match_pid(p, last_cpupid);
+ if (!priv && !(flags & TNF_NO_GROUP))
+ task_numa_group(p, last_cpupid, flags, &priv);
+ }
task_numa_placement(p);
+
+ /*
+ * Retry task to preferred node migration periodically, in case it
+ * case it previously failed, or the scheduler moved us.
+ */
+ if (time_after(jiffies, p->numa_migrate_retry))
+ numa_migrate_preferred(p);
+
+ if (migrated)
+ p->numa_pages_migrated += pages;
+
+ p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+ p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -846,6 +1681,7 @@ void task_numa_work(struct callback_head *work)
struct mm_struct *mm = p->mm;
struct vm_area_struct *vma;
unsigned long start, end;
+ unsigned long nr_pte_updates = 0;
long pages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -862,35 +1698,9 @@ void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
- /*
- * We do not care about task placement until a task runs on a node
- * other than the first one used by the address space. This is
- * largely because migrations are driven by what CPU the task
- * is running on. If it's never scheduled on another node, it'll
- * not migrate so why bother trapping the fault.
- */
- if (mm->first_nid == NUMA_PTE_SCAN_INIT)
- mm->first_nid = numa_node_id();
- if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
- /* Are we running on a new node yet? */
- if (numa_node_id() == mm->first_nid &&
- !sched_feat_numa(NUMA_FORCE))
- return;
-
- mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
- }
-
- /*
- * Reset the scan period if enough time has gone by. Objective is that
- * scanning will be reduced if pages are properly placed. As tasks
- * can enter different phases this needs to be re-examined. Lacking
- * proper tracking of reference behaviour, this blunt hammer is used.
- */
- migrate = mm->numa_next_reset;
- if (time_after(now, migrate)) {
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
- next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
- xchg(&mm->numa_next_reset, next_scan);
+ if (!mm->numa_next_scan) {
+ mm->numa_next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
}
/*
@@ -900,20 +1710,20 @@ void task_numa_work(struct callback_head *work)
if (time_before(now, migrate))
return;
- if (p->numa_scan_period == 0)
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ if (p->numa_scan_period == 0) {
+ p->numa_scan_period_max = task_scan_max(p);
+ p->numa_scan_period = task_scan_min(p);
+ }
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
/*
- * Do not set pte_numa if the current running node is rate-limited.
- * This loses statistics on the fault but if we are unwilling to
- * migrate to this node, it is less likely we can do useful work
+ * Delay this task enough that another task of this mm will likely win
+ * the next time around.
*/
- if (migrate_ratelimited(numa_node_id()))
- return;
+ p->node_stamp += 2 * TICK_NSEC;
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
@@ -929,18 +1739,32 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
continue;
- /* Skip small VMAs. They are not likely to be of relevance */
- if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+ /*
+ * Shared library pages mapped by multiple processes are not
+ * migrated as it is expected they are cache replicated. Avoid
+ * hinting faults in read-only file-backed mappings or the vdso
+ * as migrating the pages will be of marginal benefit.
+ */
+ if (!vma->vm_mm ||
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
continue;
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
- pages -= change_prot_numa(vma, start, end);
+ nr_pte_updates += change_prot_numa(vma, start, end);
+
+ /*
+ * Scan sysctl_numa_balancing_scan_size but ensure that
+ * at least one PTE is updated so that unused virtual
+ * address space is quickly skipped.
+ */
+ if (nr_pte_updates)
+ pages -= (end - start) >> PAGE_SHIFT;
start = end;
if (pages <= 0)
@@ -950,10 +1774,10 @@ void task_numa_work(struct callback_head *work)
out:
/*
- * It is possible to reach the end of the VMA list but the last few VMAs are
- * not guaranteed to the vma_migratable. If they are not, we would find the
- * !migratable VMA on the next scan but not reset the scanner to the start
- * so check it now.
+ * It is possible to reach the end of the VMA list but the last few
+ * VMAs are not guaranteed to the vma_migratable. If they are not, we
+ * would find the !migratable VMA on the next scan but not reset the
+ * scanner to the start so check it now.
*/
if (vma)
mm->numa_scan_offset = start;
@@ -987,8 +1811,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now - curr->node_stamp > period) {
if (!curr->node_stamp)
- curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
- curr->node_stamp = now;
+ curr->numa_scan_period = task_scan_min(curr);
+ curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1000,6 +1824,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -1009,8 +1841,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (!parent_entity(se))
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
- if (entity_is_task(se))
- list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
#endif
cfs_rq->nr_running++;
}
@@ -1021,8 +1857,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se))
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
+ }
cfs_rq->nr_running--;
}
@@ -1037,7 +1875,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic64_read(&tg->load_avg);
+ tg_weight = atomic_long_read(&tg->load_avg);
tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
@@ -1110,8 +1948,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
/*
* We choose a half-life close to 1 scheduling period.
* Note: The tables below are dependent on this value.
@@ -1319,13 +2156,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update)
{
struct task_group *tg = cfs_rq->tg;
- s64 tg_contrib;
+ long tg_contrib;
tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
tg_contrib -= cfs_rq->tg_load_contrib;
- if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic64_add(tg_contrib, &tg->load_avg);
+ if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic_long_add(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
}
}
@@ -1341,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
long contrib;
/* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
@@ -1360,8 +2197,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
u64 contrib;
contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div64_u64(contrib,
- atomic64_read(&tg->load_avg) + 1);
+ se->avg.load_avg_contrib = div_u64(contrib,
+ atomic_long_read(&tg->load_avg) + 1);
/*
* For group entities we need to compute a correction term in the case
@@ -1480,8 +2317,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
if (!decays && !force_update)
return;
- if (atomic64_read(&cfs_rq->removed_load)) {
- u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+ if (atomic_long_read(&cfs_rq->removed_load)) {
+ unsigned long removed_load;
+ removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
subtract_blocked_load_contrib(cfs_rq, removed_load);
}
@@ -1497,7 +2335,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
- __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+ __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
@@ -1510,9 +2348,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
* We track migrations using entity decay_count <= 0, on a wake-up
* migration we use a negative decay count to track the remote decays
* accumulated while sleeping.
+ *
+ * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
+ * are seen by enqueue_entity_load_avg() as a migration with an already
+ * constructed load_avg_contrib.
*/
if (unlikely(se->avg.decay_count <= 0)) {
- se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+ se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
if (se->avg.decay_count) {
/*
* In a wake-up migration we have to approximate the
@@ -1530,7 +2372,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}
wakeup = 0;
} else {
- __synchronize_entity_decay(se);
+ /*
+ * Task re-woke on same cpu (or else migrate_task_rq_fair()
+ * would have made count negative); we must be careful to avoid
+ * double-accounting blocked time after synchronizing decays.
+ */
+ se->avg.last_runnable_update += __synchronize_entity_decay(se)
+ << 20;
}
/* migrated tasks did not contribute to our blocked load */
@@ -1607,7 +2455,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
tsk = task_of(se);
if (se->statistics.sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
@@ -1624,7 +2472,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
}
if (se->statistics.block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
@@ -1712,7 +2560,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
- * through callig update_curr().
+ * through calling update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
@@ -1805,9 +2653,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
#endif
}
@@ -1984,6 +2832,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -2021,13 +2870,14 @@ static inline bool cfs_bandwidth_used(void)
return static_key_false(&__cfs_bandwidth_used);
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_inc(void)
{
- /* only need to count groups transitioning between enabled/!enabled */
- if (enabled && !was_enabled)
- static_key_slow_inc(&__cfs_bandwidth_used);
- else if (!enabled && was_enabled)
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
+{
+ static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -2035,7 +2885,8 @@ static bool cfs_bandwidth_used(void)
return true;
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
#endif /* HAVE_JUMP_LABEL */
/*
@@ -2082,7 +2933,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
if (unlikely(cfs_rq->throttle_count))
return cfs_rq->throttled_clock_task;
- return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+ return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
/* returns 0 on failure to allocate runtime */
@@ -2138,10 +2989,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct rq *rq = rq_of(cfs_rq);
/* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+ if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
return;
if (cfs_rq->runtime_remaining < 0)
@@ -2230,7 +3080,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
- cfs_rq->throttled_clock_task_time += rq->clock_task -
+ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
#endif
@@ -2245,7 +3095,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- cfs_rq->throttled_clock_task = rq->clock_task;
+ cfs_rq->throttled_clock_task = rq_clock_task(rq);
cfs_rq->throttle_count++;
return 0;
@@ -2284,9 +3134,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq->clock;
+ cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
}
@@ -2298,15 +3150,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
int enqueue = 1;
long task_delta;
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ se = cfs_rq->tg->se[cpu_of(rq)];
cfs_rq->throttled = 0;
+
+ update_rq_clock(rq);
+
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- update_rq_clock(rq);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
@@ -2398,6 +3252,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (idle)
goto out_unlock;
+ /*
+ * if we have relooped after returning idle once, we need to update our
+ * status as actually running, so that other cpus doing
+ * __start_cfs_bandwidth will stop trying to cancel us.
+ */
+ cfs_b->timer_active = 1;
+
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
@@ -2458,7 +3319,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2534,10 +3401,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
u64 expires;
/* confirm we're still not at a refresh boundary */
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ raw_spin_lock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
return;
+ }
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
@@ -2599,10 +3468,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -2662,11 +3527,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
raw_spin_unlock(&cfs_b->lock);
- /* ensure cfs_b->lock is available while we wait */
- hrtimer_cancel(&cfs_b->period_timer);
-
+ cpu_relax();
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
if (cfs_b->timer_active)
@@ -2706,7 +3571,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
#else /* CONFIG_CFS_BANDWIDTH */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
- return rq_of(cfs_rq)->clock_task;
+ return rq_clock_task(rq_of(cfs_rq));
}
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -2919,7 +3784,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->load.weight;
+ return cpu_rq(cpu)->cfs.runnable_load_avg;
}
/*
@@ -2964,13 +3829,31 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+ unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
- return rq->load.weight / nr_running;
+ return load_avg / nr_running;
return 0;
}
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Rough decay (wiping) for cost saving, don't worry
+ * about the boundary, really active task won't care
+ * about the loss.
+ */
+ if (jiffies > current->wakee_flip_decay_ts + HZ) {
+ current->wakee_flips = 0;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
static void task_waking_fair(struct task_struct *p)
{
@@ -2991,6 +3874,7 @@ static void task_waking_fair(struct task_struct *p)
#endif
se->vruntime -= min_vruntime;
+ record_wakee(p);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3048,7 +3932,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent) /* the trivial, non-cgroup case */
+ if (!tg->parent || !wl) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
@@ -3101,14 +3985,35 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
}
#else
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
- unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
return wl;
}
#endif
+static int wake_wide(struct task_struct *p)
+{
+ int factor = this_cpu_read(sd_llc_size);
+
+ /*
+ * Yeah, it's the switching-frequency, could means many wakee or
+ * rapidly switch, use factor here will just help to automatically
+ * adjust the loose-degree, so bigger node will lead to more pull.
+ */
+ if (p->wakee_flips > factor) {
+ /*
+ * wakee is somewhat hot, it needs certain amount of cpu
+ * resource, so if waker is far more hot, prefer to leave
+ * it alone.
+ */
+ if (current->wakee_flips > (factor * p->wakee_flips))
+ return 1;
+ }
+
+ return 0;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
@@ -3118,6 +4023,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
+ /*
+ * If we wake multiple tasks be careful to not bounce
+ * ourselves around too much.
+ */
+ if (wake_wide(p))
+ return 0;
+
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -3326,11 +4238,10 @@ done:
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
@@ -3416,12 +4327,6 @@ unlock:
}
/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3441,10 +4346,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
*/
if (se->avg.decay_count) {
se->avg.decay_count = -__synchronize_entity_decay(se);
- atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+ atomic_long_add(se->avg.load_avg_contrib,
+ &cfs_rq->removed_load);
}
}
-#endif
#endif /* CONFIG_SMP */
static unsigned long
@@ -3816,9 +4721,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+enum fbq_type { regular, remote, all };
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
-#define LBF_SOME_PINNED 0x04
+#define LBF_DST_PINNED 0x04
+#define LBF_SOME_PINNED 0x08
struct lb_env {
struct sched_domain *sd;
@@ -3841,6 +4749,8 @@ struct lb_env {
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
+
+ enum fbq_type fbq_type;
};
/*
@@ -3887,6 +4797,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
return delta < (s64)sysctl_sched_migration_cost;
}
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+ !(env->sd->flags & SD_NUMA)) {
+ return false;
+ }
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ /* Always encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return true;
+
+ /* If both task and group weight improve, this move is a winner. */
+ if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+ group_weight(p, dst_nid) > group_weight(p, src_nid))
+ return true;
+
+ return false;
+}
+
+
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+ return false;
+
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ return false;
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return true;
+
+ /* If either task or group weight get worse, don't do it. */
+ if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+ group_weight(p, dst_nid) < group_weight(p, src_nid))
+ return true;
+
+ return false;
+}
+
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+
+static inline bool migrate_degrades_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+#endif
+
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -3909,6 +4891,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ env->flags |= LBF_SOME_PINNED;
+
/*
* Remember if this task can be migrated to any other cpu in
* our sched_group. We may want to revisit it if we couldn't
@@ -3917,13 +4901,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* Also avoid computing new_dst_cpu if we have already computed
* one in current iteration.
*/
- if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
return 0;
/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
- env->flags |= LBF_SOME_PINNED;
+ env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
}
@@ -3942,11 +4926,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* Aggressive migration if:
- * 1) task is cache cold, or
- * 2) too many balance attempts have failed.
+ * 1) destination numa is preferred
+ * 2) task is cache cold, or
+ * 3) too many balance attempts have failed.
*/
+ tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+ if (!tsk_cache_hot)
+ tsk_cache_hot = migrate_degrades_locality(p, env);
+
+ if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
- tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
@@ -3989,8 +4986,6 @@ static int move_one_task(struct lb_env *env)
return 0;
}
-static unsigned long task_h_load(struct task_struct *p);
-
static const unsigned int sched_nr_migrate_break = 32;
/*
@@ -4131,118 +5126,124 @@ static void update_blocked_averages(int cpu)
}
/*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
-static int tg_load_down(struct task_group *tg, void *data)
-{
- unsigned long load;
- long cpu = (long)data;
-
- if (!tg->parent) {
- load = cpu_rq(cpu)->load.weight;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->se[cpu]->load.weight;
- load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- }
-
- tg->cfs_rq[cpu]->h_load = load;
-
- return 0;
-}
-
-static void update_h_load(long cpu)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
- struct rq *rq = cpu_rq(cpu);
+ struct rq *rq = rq_of(cfs_rq);
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
unsigned long now = jiffies;
+ unsigned long load;
- if (rq->h_load_throttle == now)
+ if (cfs_rq->last_h_load_update == now)
return;
- rq->h_load_throttle = now;
+ cfs_rq->h_load_next = NULL;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_load_next = se;
+ if (cfs_rq->last_h_load_update == now)
+ break;
+ }
- rcu_read_lock();
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
- rcu_read_unlock();
+ if (!se) {
+ cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->last_h_load_update = now;
+ }
+
+ while ((se = cfs_rq->h_load_next) != NULL) {
+ load = cfs_rq->h_load;
+ load = div64_ul(load * se->avg.load_avg_contrib,
+ cfs_rq->runnable_load_avg + 1);
+ cfs_rq = group_cfs_rq(se);
+ cfs_rq->h_load = load;
+ cfs_rq->last_h_load_update = now;
+ }
}
static unsigned long task_h_load(struct task_struct *p)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
- unsigned long load;
-
- load = p->se.load.weight;
- load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
- return load;
+ update_cfs_rq_h_load(cfs_rq);
+ return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
+ cfs_rq->runnable_load_avg + 1);
}
#else
static inline void update_blocked_averages(int cpu)
{
}
-static inline void update_h_load(long cpu)
-{
-}
-
static unsigned long task_h_load(struct task_struct *p)
{
- return p->se.load.weight;
+ return p->se.avg.load_avg_contrib;
}
#endif
/********** Helpers for find_busiest_group ************************/
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
- */
-struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *this; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
-
- /** Statistics of this group */
- unsigned long this_load;
- unsigned long this_load_per_task;
- unsigned long this_nr_running;
- unsigned long this_has_capacity;
- unsigned int this_idle_cpus;
-
- /* Statistics of the busiest group */
- unsigned int busiest_idle_cpus;
- unsigned long max_load;
- unsigned long busiest_load_per_task;
- unsigned long busiest_nr_running;
- unsigned long busiest_group_capacity;
- unsigned long busiest_has_capacity;
- unsigned int busiest_group_weight;
-
- int group_imb; /* Is there imbalance in this sd */
-};
-
-/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
- unsigned long group_capacity;
- unsigned long idle_cpus;
- unsigned long group_weight;
+ unsigned long load_per_task;
+ unsigned long group_power;
+ unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int group_capacity;
+ unsigned int idle_cpus;
+ unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
};
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+ /*
+ * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+ * local_stat because update_sg_lb_stats() does a full clear/assignment.
+ * We must however clear busiest_stat::avg_load because
+ * update_sd_pick_busiest() reads this before assignment.
+ */
+ *sds = (struct sd_lb_stats){
+ .busiest = NULL,
+ .local = NULL,
+ .total_load = 0UL,
+ .total_pwr = 0UL,
+ .busiest_stat = {
+ .avg_load = 0UL,
+ },
+ };
+}
+
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ *
+ * Return: The load index.
*/
static inline int get_sd_load_idx(struct sched_domain *sd,
enum cpu_idle_type idle)
@@ -4302,7 +5303,7 @@ static unsigned long scale_rt_power(int cpu)
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
- total = sched_avg_period() + (rq->clock - age_stamp);
+ total = sched_avg_period() + (rq_clock(rq) - age_stamp);
if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
@@ -4357,7 +5358,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long power;
+ unsigned long power, power_orig;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -4369,7 +5370,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
return;
}
- power = 0;
+ power_orig = power = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -4377,8 +5378,33 @@ void update_group_power(struct sched_domain *sd, int cpu)
* span the current group.
*/
- for_each_cpu(cpu, sched_group_cpus(sdg))
- power += power_of(cpu);
+ for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ struct sched_group_power *sgp;
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * build_sched_domains() -> init_sched_groups_power()
+ * gets here before we've attached the domains to the
+ * runqueues.
+ *
+ * Use power_of(), which is set irrespective of domains
+ * in update_cpu_power().
+ *
+ * This avoids power/power_orig from being 0 and
+ * causing divide-by-zero issues on boot.
+ *
+ * Runtime updates will correct power_orig.
+ */
+ if (unlikely(!rq->sd)) {
+ power_orig += power_of(cpu);
+ power += power_of(cpu);
+ continue;
+ }
+
+ sgp = rq->sd->groups->sgp;
+ power_orig += sgp->power_orig;
+ power += sgp->power;
+ }
} else {
/*
* !SD_OVERLAP domains can assume that child groups
@@ -4387,12 +5413,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
+ power_orig += group->sgp->power_orig;
power += group->sgp->power;
group = group->next;
} while (group != child->groups);
}
- sdg->sgp->power_orig = sdg->sgp->power = power;
+ sdg->sgp->power_orig = power_orig;
+ sdg->sgp->power = power;
}
/*
@@ -4420,33 +5448,84 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
return 0;
}
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+static inline int sg_imbalanced(struct sched_group *group)
+{
+ return group->sgp->imbalance;
+}
+
+/*
+ * Compute the group capacity.
+ *
+ * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
+ * first dividing out the smt factor and computing the actual number of cores
+ * and limit power unit capacity with that.
+ */
+static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
+{
+ unsigned int capacity, smt, cpus;
+ unsigned int power, power_orig;
+
+ power = group->sgp->power;
+ power_orig = group->sgp->power_orig;
+ cpus = group->group_weight;
+
+ /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
+ smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
+ capacity = cpus / smt; /* cores */
+
+ capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
+ if (!capacity)
+ capacity = fix_small_capacity(env->sd, group);
+
+ return capacity;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
- * @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, int *balance, struct sg_lb_stats *sgs)
+ int local_group, struct sg_lb_stats *sgs)
{
- unsigned long nr_running, max_nr_running, min_nr_running;
- unsigned long load, max_cpu_load, min_cpu_load;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long avg_load_per_task = 0;
+ unsigned long nr_running;
+ unsigned long load;
int i;
- if (local_group)
- balance_cpu = group_balance_cpu(group);
-
- /* Tally up the load of all CPUs in the group */
- max_cpu_load = 0;
- min_cpu_load = ~0UL;
- max_nr_running = 0;
- min_nr_running = ~0UL;
+ memset(sgs, 0, sizeof(*sgs));
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
@@ -4454,76 +5533,34 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
/* Bias balancing toward cpus of our domain */
- if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu &&
- cpumask_test_cpu(i, sched_group_mask(group))) {
- first_idle_cpu = 1;
- balance_cpu = i;
- }
-
+ if (local_group)
load = target_load(i, load_idx);
- } else {
+ else
load = source_load(i, load_idx);
- if (load > max_cpu_load)
- max_cpu_load = load;
- if (min_cpu_load > load)
- min_cpu_load = load;
-
- if (nr_running > max_nr_running)
- max_nr_running = nr_running;
- if (min_nr_running > nr_running)
- min_nr_running = nr_running;
- }
sgs->group_load += load;
sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
}
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above
- * domains. In the newly idle case, we will allow all the cpu's
- * to do the newly idle load balance.
- */
- if (local_group) {
- if (env->idle != CPU_NEWLY_IDLE) {
- if (balance_cpu != env->dst_cpu) {
- *balance = 0;
- return;
- }
- update_group_power(env->sd, env->dst_cpu);
- } else if (time_after_eq(jiffies, group->sgp->next_update))
- update_group_power(env->sd, env->dst_cpu);
- }
-
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+ sgs->group_power = group->sgp->power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of a task.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
if (sgs->sum_nr_running)
- avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
- (max_nr_running - min_nr_running) > 1)
- sgs->group_imb = 1;
-
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
- SCHED_POWER_SCALE);
- if (!sgs->group_capacity)
- sgs->group_capacity = fix_small_capacity(env->sd, group);
sgs->group_weight = group->group_weight;
+ sgs->group_imb = sg_imbalanced(group);
+ sgs->group_capacity = sg_capacity(env, group);
+
if (sgs->group_capacity > sgs->sum_nr_running)
sgs->group_has_capacity = 1;
}
@@ -4537,13 +5574,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
*/
static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->max_load)
+ if (sgs->avg_load <= sds->busiest_stat.avg_load)
return false;
if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4569,18 +5609,46 @@ static bool update_sd_pick_busiest(struct lb_env *env,
return false;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
- * @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct lb_env *env,
- int *balance, struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
- struct sg_lb_stats sgs;
+ struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
@@ -4589,17 +5657,23 @@ static inline void update_sd_lb_stats(struct lb_env *env,
load_idx = get_sd_load_idx(env->sd, env->idle);
do {
+ struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
- memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
+ if (local_group) {
+ sds->local = sg;
+ sgs = &sds->local_stat;
- if (local_group && !(*balance))
- return;
+ if (env->idle != CPU_NEWLY_IDLE ||
+ time_after_eq(jiffies, sg->sgp->next_update))
+ update_group_power(env->sd, env->dst_cpu);
+ }
+
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
- sds->total_load += sgs.group_load;
- sds->total_pwr += sg->sgp->power;
+ if (local_group)
+ goto next_group;
/*
* In case the child domain prefers tasks go to siblings
@@ -4611,30 +5685,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling && !local_group && sds->this_has_capacity)
- sgs.group_capacity = min(sgs.group_capacity, 1UL);
+ if (prefer_sibling && sds->local &&
+ sds->local_stat.group_has_capacity)
+ sgs->group_capacity = min(sgs->group_capacity, 1U);
- if (local_group) {
- sds->this_load = sgs.avg_load;
- sds->this = sg;
- sds->this_nr_running = sgs.sum_nr_running;
- sds->this_load_per_task = sgs.sum_weighted_load;
- sds->this_has_capacity = sgs.group_has_capacity;
- sds->this_idle_cpus = sgs.idle_cpus;
- } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
- sds->max_load = sgs.avg_load;
+ if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
- sds->busiest_nr_running = sgs.sum_nr_running;
- sds->busiest_idle_cpus = sgs.idle_cpus;
- sds->busiest_group_capacity = sgs.group_capacity;
- sds->busiest_load_per_task = sgs.sum_weighted_load;
- sds->busiest_has_capacity = sgs.group_has_capacity;
- sds->busiest_group_weight = sgs.group_weight;
- sds->group_imb = sgs.group_imb;
+ sds->busiest_stat = *sgs;
}
+next_group:
+ /* Now, start updating sd_lb_stats */
+ sds->total_load += sgs->group_load;
+ sds->total_pwr += sgs->group_power;
+
sg = sg->next;
} while (sg != env->sd->groups);
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
}
/**
@@ -4654,7 +5723,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
* assuming lower CPU number will be equivalent to lower a SMT thread
* number.
*
- * Returns 1 when packing is required and a task should be moved to
+ * Return: 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in *imbalance.
*
* @env: The load balancing environment.
@@ -4675,7 +5744,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
- sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
+ SCHED_POWER_SCALE);
return 1;
}
@@ -4693,24 +5763,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
+ struct sg_lb_stats *local, *busiest;
- if (sds->this_nr_running) {
- sds->this_load_per_task /= sds->this_nr_running;
- if (sds->busiest_load_per_task >
- sds->this_load_per_task)
- imbn = 1;
- } else {
- sds->this_load_per_task =
- cpu_avg_load_per_task(env->dst_cpu);
- }
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (!local->sum_nr_running)
+ local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+ else if (busiest->load_per_task > local->load_per_task)
+ imbn = 1;
- scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->sgp->power;
+ scaled_busy_load_per_task =
+ (busiest->load_per_task * SCHED_POWER_SCALE) /
+ busiest->group_power;
- if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
- (scaled_busy_load_per_task * imbn)) {
- env->imbalance = sds->busiest_load_per_task;
+ if (busiest->avg_load + scaled_busy_load_per_task >=
+ local->avg_load + (scaled_busy_load_per_task * imbn)) {
+ env->imbalance = busiest->load_per_task;
return;
}
@@ -4720,34 +5789,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
* moving them.
*/
- pwr_now += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load);
+ pwr_now += busiest->group_power *
+ min(busiest->load_per_task, busiest->avg_load);
+ pwr_now += local->group_power *
+ min(local->load_per_task, local->avg_load);
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->sgp->power;
- if (sds->max_load > tmp)
- pwr_move += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load - tmp);
+ tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+ busiest->group_power;
+ if (busiest->avg_load > tmp) {
+ pwr_move += busiest->group_power *
+ min(busiest->load_per_task,
+ busiest->avg_load - tmp);
+ }
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->sgp->power <
- sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->sgp->power) /
- sds->this->sgp->power;
- else
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->sgp->power;
- pwr_move += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load + tmp);
+ if (busiest->avg_load * busiest->group_power <
+ busiest->load_per_task * SCHED_POWER_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_power) /
+ local->group_power;
+ } else {
+ tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+ local->group_power;
+ }
+ pwr_move += local->group_power *
+ min(local->load_per_task, local->avg_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
- env->imbalance = sds->busiest_load_per_task;
+ env->imbalance = busiest->load_per_task;
}
/**
@@ -4759,11 +5831,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
+ struct sg_lb_stats *local, *busiest;
+
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
- sds->busiest_load_per_task /= sds->busiest_nr_running;
- if (sds->group_imb) {
- sds->busiest_load_per_task =
- min(sds->busiest_load_per_task, sds->avg_load);
+ if (busiest->group_imb) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure cpu-load equilibrium, look at wider averages. XXX
+ */
+ busiest->load_per_task =
+ min(busiest->load_per_task, sds->avg_load);
}
/*
@@ -4771,21 +5850,23 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
- if (sds->max_load < sds->avg_load) {
+ if (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
- if (!sds->group_imb) {
+ if (!busiest->group_imb) {
/*
* Don't want to pull so many tasks that a group would go idle.
+ * Except of course for the group_imb case, since then we might
+ * have to drop below capacity to reach cpu-load equilibrium.
*/
- load_above_capacity = (sds->busiest_nr_running -
- sds->busiest_group_capacity);
+ load_above_capacity =
+ (busiest->sum_nr_running - busiest->group_capacity);
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
- load_above_capacity /= sds->busiest->sgp->power;
+ load_above_capacity /= busiest->group_power;
}
/*
@@ -4795,15 +5876,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* we also don't want to reduce the group load below the group capacity
* (so that we can implement power-savings policies etc). Thus we look
* for the minimum possible imbalance.
- * Be careful of negative numbers as they'll appear as very large values
- * with unsigned longs.
*/
- max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- env->imbalance = min(max_pull * sds->busiest->sgp->power,
- (sds->avg_load - sds->this_load) * sds->this->sgp->power)
- / SCHED_POWER_SCALE;
+ env->imbalance = min(
+ max_pull * busiest->group_power,
+ (sds->avg_load - local->avg_load) * local->group_power
+ ) / SCHED_POWER_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -4811,9 +5891,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* a think about bumping its value to force at least one task to be
* moved
*/
- if (env->imbalance < sds->busiest_load_per_task)
+ if (env->imbalance < busiest->load_per_task)
return fix_small_imbalance(env, sds);
-
}
/******* find_busiest_group() helpers end here *********************/
@@ -4829,69 +5908,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* to restore balance.
*
* @env: The load balancing environment.
- * @balance: Pointer to a variable indicating if this_cpu
- * is the appropriate cpu to perform load balancing at this_level.
*
- * Returns: - the busiest group if imbalance exists.
+ * Return: - The busiest group if imbalance exists.
* - If no imbalance and user has opted for power-savings balance,
* return the least loaded group whose CPUs can be
* put to idle by rebalancing its tasks onto our group.
*/
-static struct sched_group *
-find_busiest_group(struct lb_env *env, int *balance)
+static struct sched_group *find_busiest_group(struct lb_env *env)
{
+ struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
- memset(&sds, 0, sizeof(sds));
+ init_sd_lb_stats(&sds);
/*
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(env, balance, &sds);
-
- /*
- * this_cpu is not the appropriate cpu to perform load balancing at
- * this level.
- */
- if (!(*balance))
- goto ret;
+ update_sd_lb_stats(env, &sds);
+ local = &sds.local_stat;
+ busiest = &sds.busiest_stat;
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || sds.busiest_nr_running == 0)
+ if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
/*
* If the busiest group is imbalanced the below checks don't
- * work because they assumes all things are equal, which typically
+ * work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (sds.group_imb)
+ if (busiest->group_imb)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
- !sds.busiest_has_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
+ !busiest->group_has_capacity)
goto force_balance;
/*
* If the local group is more busy than the selected busiest group
* don't try and pull any tasks.
*/
- if (sds.this_load >= sds.max_load)
+ if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/*
* Don't pull any tasks if this group is already above the domain
* average load.
*/
- if (sds.this_load >= sds.avg_load)
+ if (local->avg_load >= sds.avg_load)
goto out_balanced;
if (env->idle == CPU_IDLE) {
@@ -4901,15 +5973,16 @@ find_busiest_group(struct lb_env *env, int *balance)
* there is no imbalance between this and busiest group
* wrt to idle cpu's, it is balanced.
*/
- if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
- sds.busiest_nr_running <= sds.busiest_group_weight)
+ if ((local->idle_cpus < busiest->idle_cpus) &&
+ busiest->sum_nr_running <= busiest->group_weight)
goto out_balanced;
} else {
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
*/
- if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
+ if (100 * busiest->avg_load <=
+ env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
@@ -4919,7 +5992,6 @@ force_balance:
return sds.busiest;
out_balanced:
-ret:
env->imbalance = 0;
return NULL;
}
@@ -4931,22 +6003,43 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long max_load = 0;
+ unsigned long busiest_load = 0, busiest_power = 1;
int i;
- for_each_cpu(i, sched_group_cpus(group)) {
- unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power,
- SCHED_POWER_SCALE);
- unsigned long wl;
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ unsigned long power, capacity, wl;
+ enum fbq_type rt;
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
- if (!cpumask_test_cpu(i, env->cpus))
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there's better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
continue;
- rq = cpu_rq(i);
+ power = power_of(i);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
+ if (!capacity)
+ capacity = fix_small_capacity(env->sd, group);
+
wl = weighted_cpuload(i);
/*
@@ -4961,11 +6054,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* the weighted_cpuload() scaled with the cpu power, so that
* the load can be moved away from the cpu that is potentially
* running at a lower capacity.
+ *
+ * Thus we're looking for max(wl_i / power_i), crosswise
+ * multiplication to rid ourselves of the division works out
+ * to: wl_i * power_j > wl_j * power_i; where j is our
+ * previous maximum.
*/
- wl = (wl * SCHED_POWER_SCALE) / power;
-
- if (wl > max_load) {
- max_load = wl;
+ if (wl * busiest_power > busiest_load * power) {
+ busiest_load = wl;
+ busiest_power = power;
busiest = rq;
}
}
@@ -5002,15 +6099,50 @@ static int need_active_balance(struct lb_env *env)
static int active_load_balance_cpu_stop(void *data);
+static int should_we_balance(struct lb_env *env)
+{
+ struct sched_group *sg = env->sd->groups;
+ struct cpumask *sg_cpus, *sg_mask;
+ int cpu, balance_cpu = -1;
+
+ /*
+ * In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ return 1;
+
+ sg_cpus = sched_group_cpus(sg);
+ sg_mask = sched_group_mask(sg);
+ /* Try to find first idle cpu */
+ for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ continue;
+
+ balance_cpu = cpu;
+ break;
+ }
+
+ if (balance_cpu == -1)
+ balance_cpu = group_balance_cpu(sg);
+
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above domains.
+ */
+ return balance_cpu == env->dst_cpu;
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance)
+ int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
+ struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -5024,6 +6156,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
+ .fbq_type = all,
};
/*
@@ -5038,11 +6171,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(&env, balance);
-
- if (*balance == 0)
+ if (!should_we_balance(&env)) {
+ *continue_balancing = 0;
goto out_balanced;
+ }
+ group = find_busiest_group(&env);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
@@ -5071,7 +6205,6 @@ redo:
env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
- update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
double_rq_lock(env.dst_rq, busiest);
@@ -5115,17 +6248,17 @@ more_balance:
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
- env.flags &= ~LBF_SOME_PINNED;
+ env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
- /* Prevent to re-select dst_cpu via env's cpus */
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
-
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
@@ -5133,6 +6266,18 @@ more_balance:
goto more_balance;
}
+ /*
+ * We failed to reach balance because of affinity.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ *group_imbalance = 1;
+ } else if (*group_imbalance)
+ *group_imbalance = 0;
+ }
+
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5240,8 +6385,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
+ u64 curr_cost = 0;
- this_rq->idle_stamp = this_rq->clock;
+ this_rq->idle_stamp = rq_clock(this_rq);
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
@@ -5255,15 +6401,28 @@ void idle_balance(int this_cpu, struct rq *this_rq)
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
- int balance = 1;
+ int continue_balancing = 1;
+ u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+ break;
+
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ t0 = sched_clock_cpu(this_cpu);
+
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE, &balance);
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ if (domain_cost > sd->max_newidle_lb_cost)
+ sd->max_newidle_lb_cost = domain_cost;
+
+ curr_cost += domain_cost;
}
interval = msecs_to_jiffies(sd->balance_interval);
@@ -5285,6 +6444,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
*/
this_rq->next_balance = next_balance;
}
+
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
}
/*
@@ -5421,14 +6583,13 @@ static inline void set_cpu_sd_state_busy(void)
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- for (; sd; sd = sd->parent)
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -5439,14 +6600,13 @@ void set_cpu_sd_state_idle(void)
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- for (; sd; sd = sd->parent)
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -5471,7 +6631,7 @@ void nohz_balance_enter_idle(int cpu)
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
-static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+static int sched_ilb_notifier(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
@@ -5503,22 +6663,46 @@ void update_max_interval(void)
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
- int balance = 1;
+ int continue_balancing = 1;
struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize;
+ int need_serialize, need_decay = 0;
+ u64 max_cost = 0;
update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
+ /*
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains. Decay ~1% per second.
+ */
+ if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+ sd->max_newidle_lb_cost =
+ (sd->max_newidle_lb_cost * 253) / 256;
+ sd->next_decay_max_lb_cost = jiffies + HZ;
+ need_decay = 1;
+ }
+ max_cost += sd->max_newidle_lb_cost;
+
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!continue_balancing) {
+ if (need_decay)
+ continue;
+ break;
+ }
+
interval = sd->balance_interval;
if (idle != CPU_IDLE)
interval *= sd->busy_factor;
@@ -5535,9 +6719,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
/*
- * The LBF_SOME_PINNED logic could have changed
+ * The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
@@ -5552,14 +6736,14 @@ out:
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
-
+ }
+ if (need_decay) {
/*
- * Stop the load balance at this level. There is another
- * CPU in our sched group which is doing load balancing more
- * actively.
+ * Ensure the rq-wide value also decays but keep it at a
+ * reasonable floor to avoid funnies with rq->avg_idle.
*/
- if (!balance)
- break;
+ rq->max_idle_balance_cost =
+ max((u64)sysctl_sched_migration_cost, max_cost);
}
rcu_read_unlock();
@@ -5629,6 +6813,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
struct sched_domain *sd;
+ struct sched_group_power *sgp;
+ int nr_busy;
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -5654,22 +6840,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
rcu_read_lock();
- for_each_domain(cpu, sd) {
- struct sched_group *sg = sd->groups;
- struct sched_group_power *sgp = sg->sgp;
- int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
- goto need_kick_unlock;
+ if (sd) {
+ sgp = sd->groups->sgp;
+ nr_busy = atomic_read(&sgp->nr_busy_cpus);
- if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
- && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+ if (nr_busy > 1)
goto need_kick_unlock;
-
- if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
- break;
}
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+
+ if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
rcu_read_unlock();
return 0;
@@ -5751,7 +6937,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
- if (sched_feat_numa(NUMA))
+ if (numabalancing_enabled)
task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
@@ -5777,11 +6963,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
@@ -5848,17 +7038,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
/*
* Remove our load from contribution when we leave sched_fair
* and ensure we don't carry in an old decay_count if we
* switch back.
*/
- if (p->se.avg.decay_count) {
- struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
- __synchronize_entity_decay(&p->se);
- subtract_blocked_load_contrib(cfs_rq,
- p->se.avg.load_avg_contrib);
+ if (se->avg.decay_count) {
+ __synchronize_entity_decay(se);
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
}
#endif
}
@@ -5907,9 +7095,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
atomic64_set(&cfs_rq->decay_counter, 1);
- atomic64_set(&cfs_rq->removed_load, 0);
+ atomic_long_set(&cfs_rq->removed_load, 0);
#endif
}
@@ -6060,7 +7248,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- update_load_set(&se->load, 0);
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
}
@@ -6091,6 +7280,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /* Possible calls to update_curr() need rq clock */
+ update_rq_clock(rq);
for_each_sched_entity(se)
update_cfs_shares(group_cfs_rq(se));
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6146,9 +7338,8 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
-#ifdef CONFIG_FAIR_GROUP_SCHED
.migrate_task_rq = migrate_task_rq_fair,
-#endif
+
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8..5716929 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
/*
* Apply the automatic NUMA scheduling policy. Enabled automatically
* at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=. Allow PTE scanning to be forced on UMA machines
- * for debugging the core machinery.
+ * numa_balancing=
*/
#ifdef CONFIG_NUMA_BALANCING
SCHED_FEAT(NUMA, false)
-SCHED_FEAT(NUMA_FORCE, false)
+
+/*
+ * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
+ * higher number of hinting faults are recorded during active load
+ * balancing.
+ */
+SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
+
+/*
+ * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
+ * lower number of hinting faults have been recorded. As this has
+ * the potential to prevent a task ever migrating to a new node
+ * due to CPU overload it is disabled by default.
+ */
+SCHED_FEAT(NUMA_RESIST_LOWER, false)
#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da010..516c3d9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 0000000..16f5a30
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
+/*
+ * kernel/sched/proc.c
+ *
+ * Kernel load calculations, forked from sched/core.c
+ */
+
+#include <linux/export.h>
+
+#include "sched.h"
+
+unsigned long this_cpu_load(void)
+{
+ struct rq *this = this_rq();
+ return this->cpu_load[0];
+}
+
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
+ long delta;
+
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
+ delta = calc_load_fold_active(this_rq);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
+}
+
+void calc_load_exit_idle(void)
+{
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the sample window, we're done.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+
+ return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+ long delta, active, n;
+
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(unsigned long ticks)
+{
+ long active, delta;
+
+ if (time_before(jiffies, calc_load_update + 10))
+ return;
+
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update += LOAD_FREQ;
+
+ /*
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+ */
+ calc_global_nohz();
+}
+
+/*
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long delta;
+
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ delta = calc_load_fold_active(this_rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * End of global load-average stuff
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+
+ sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_SMP
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->cfs.runnable_load_avg;
+}
+#else
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->load.weight;
+}
+#endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long load = get_rq_runnable_load(this_rq);
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+ unsigned long load = get_rq_runnable_load(this_rq);
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, load, 1);
+
+ calc_load_account_active(this_rq);
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 127a2c4..7d57275 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
* if we should look at the mask. It would be a shame
* if we looked at the mask, but the mask was not
* updated yet.
+ *
+ * Matched by the barrier in pull_rt_task().
*/
- wmb();
+ smp_wmb();
atomic_inc(&rq->rd->rto_count);
}
@@ -399,20 +401,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
(iter = next_task_group(iter)) && \
(rt_rq = iter->rt_rq[cpu_of(rq)]);)
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
- list_add_rcu(&rt_rq->leaf_rt_rq_list,
- &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
- list_del_rcu(&rt_rq->leaf_rt_rq_list);
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
- list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -472,7 +460,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
- return cpu_rq(smp_processor_id())->rd->span;
+ return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
@@ -509,17 +497,6 @@ typedef struct rt_rq *rt_rq_iter_t;
#define for_each_rt_rq(rt_rq, iter, rq) \
for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
- for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
@@ -699,15 +676,6 @@ balanced:
}
}
-static void disable_runtime(struct rq *rq)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __disable_runtime(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
static void __enable_runtime(struct rq *rq)
{
rt_rq_iter_t iter;
@@ -732,37 +700,6 @@ static void __enable_runtime(struct rq *rq)
}
}
-static void enable_runtime(struct rq *rq)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __enable_runtime(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (int)(long)hcpu;
-
- switch (action) {
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- disable_runtime(cpu_rq(cpu));
- return NOTIFY_OK;
-
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- enable_runtime(cpu_rq(cpu));
- return NOTIFY_OK;
-
- default:
- return NOTIFY_DONE;
- }
-}
-
static int balance_runtime(struct rt_rq *rt_rq)
{
int more = 0;
@@ -926,7 +863,7 @@ static void update_curr_rt(struct rq *rq)
if (curr->sched_class != &rt_sched_class)
return;
- delta_exec = rq->clock_task - curr->se.exec_start;
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
@@ -936,7 +873,7 @@ static void update_curr_rt(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq->clock_task;
+ curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -1106,9 +1043,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
- if (!rt_rq->rt_nr_running)
- list_add_leaf_rt_rq(rt_rq);
-
if (head)
list_add(&rt_se->run_list, queue);
else
@@ -1128,8 +1062,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
- if (!rt_rq->rt_nr_running)
- list_del_leaf_rt_rq(rt_rq);
}
/*
@@ -1239,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
- int cpu;
-
- cpu = task_cpu(p);
if (p->nr_cpus_allowed == 1)
goto out;
@@ -1283,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
*/
if (curr && unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio) &&
- (p->nr_cpus_allowed > 1)) {
+ curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1385,7 +1313,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
} while (rt_rq);
p = rt_task_of(rt_se);
- p->se.exec_start = rq->clock_task;
+ p->se.exec_start = rq_clock_task(rq);
return p;
}
@@ -1434,42 +1362,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
return 0;
}
-/* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
+/*
+ * Return the highest pushable rq's task, which is suitable to be executed
+ * on the cpu, NULL otherwise
+ */
+static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
- struct task_struct *next = NULL;
- struct sched_rt_entity *rt_se;
- struct rt_prio_array *array;
- struct rt_rq *rt_rq;
- int idx;
-
- for_each_leaf_rt_rq(rt_rq, rq) {
- array = &rt_rq->active;
- idx = sched_find_first_bit(array->bitmap);
-next_idx:
- if (idx >= MAX_RT_PRIO)
- continue;
- if (next && next->prio <= idx)
- continue;
- list_for_each_entry(rt_se, array->queue + idx, run_list) {
- struct task_struct *p;
+ struct plist_head *head = &rq->rt.pushable_tasks;
+ struct task_struct *p;
- if (!rt_entity_is_task(rt_se))
- continue;
+ if (!has_pushable_tasks(rq))
+ return NULL;
- p = rt_task_of(rt_se);
- if (pick_rt_task(rq, p, cpu)) {
- next = p;
- break;
- }
- }
- if (!next) {
- idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
- goto next_idx;
- }
+ plist_for_each_entry(p, head, pushable_tasks) {
+ if (pick_rt_task(rq, p, cpu))
+ return p;
}
- return next;
+ return NULL;
}
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1718,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
if (likely(!rt_overloaded(this_rq)))
return 0;
+ /*
+ * Match the barrier from rt_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the rto_mask bit.
+ */
+ smp_rmb();
+
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
@@ -1743,12 +1659,10 @@ static int pull_rt_task(struct rq *this_rq)
double_lock_balance(this_rq, src_rq);
/*
- * Are there still pullable RT tasks?
+ * We can pull only a task, which is pushable
+ * on its rq, and no others.
*/
- if (src_rq->rt.rt_nr_running <= 1)
- goto skip;
-
- p = pick_next_highest_task_rt(src_rq, this_cpu);
+ p = pick_highest_pushable_task(src_rq, this_cpu);
/*
* Do we have an RT task that preempts
@@ -2021,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
p->rt.time_slice = sched_rr_timeslice;
/*
- * Requeue to the end of queue if we (and all of our ancestors) are the
- * only element on the queue
+ * Requeue to the end of queue if we (and all of our ancestors) are not
+ * the only element on the queue
*/
for_each_sched_rt_entity(rt_se) {
if (rt_se->run_list.prev != rt_se->run_list.next) {
@@ -2037,7 +1951,7 @@ static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
- p->se.exec_start = rq->clock_task;
+ p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d..88c85b2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,12 +6,21 @@
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
#include <linux/tick.h>
+#include <linux/slab.h>
#include "cpupri.h"
#include "cpuacct.h"
+struct rq;
+
extern __read_mostly int scheduler_running;
+extern unsigned long calc_load_update;
+extern atomic_long_t calc_load_tasks;
+
+extern long calc_load_fold_active(struct rq *this_rq);
+extern void update_cpu_load_active(struct rq *this_rq);
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -140,10 +149,11 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
- atomic_t load_weight;
- atomic64_t load_avg;
+#ifdef CONFIG_SMP
+ atomic_long_t load_avg;
atomic_t runnable_avg;
#endif
+#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
@@ -261,27 +271,21 @@ struct cfs_rq {
#endif
#ifdef CONFIG_SMP
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* CFS Load tracking
* Under CFS, load is tracked on a per-entity basis and aggregated up.
* This allows for the description of both thread and group usage (in
* the FAIR_GROUP_SCHED case).
*/
- u64 runnable_load_avg, blocked_load_avg;
- atomic64_t decay_counter, removed_load;
+ unsigned long runnable_load_avg, blocked_load_avg;
+ atomic64_t decay_counter;
u64 last_decay;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+ atomic_long_t removed_load;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* Required to track per-cpu representation of a task_group */
u32 tg_runnable_contrib;
- u64 tg_load_contrib;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+ unsigned long tg_load_contrib;
/*
* h_load = weight * f(tg)
@@ -290,6 +294,9 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -353,7 +360,6 @@ struct rt_rq {
unsigned long rt_nr_boosted;
struct rq *rq;
- struct list_head leaf_rt_rq_list;
struct task_group *tg;
#endif
};
@@ -403,6 +409,10 @@ struct rq {
* remote CPUs use both these fields when doing load calculation.
*/
unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
@@ -426,9 +436,6 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#ifdef CONFIG_SMP
- unsigned long h_load_throttle;
-#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -474,6 +481,9 @@ struct rq {
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
+
+ /* This is used to determine avg_idle's max value */
+ u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -540,6 +550,22 @@ DECLARE_PER_CPU(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
+static inline u64 rq_clock(struct rq *rq)
+{
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ return rq->clock_task;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void sched_setnuma(struct task_struct *p, int node);
+extern int migrate_task_to(struct task_struct *p, int cpu);
+extern int migrate_swap(struct task_struct *, struct task_struct *);
+#endif /* CONFIG_NUMA_BALANCING */
+
#ifdef CONFIG_SMP
#define rcu_dereference_check_sched_domain(p) \
@@ -581,8 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
return hsd;
}
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & flag)
+ break;
+ }
+
+ return sd;
+}
+
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
atomic_t ref;
@@ -592,6 +634,7 @@ struct sched_group_power {
*/
unsigned int power, power_orig;
unsigned long next_update;
+ int imbalance; /* XXX unrelated to power but shared group state */
/*
* Number of busy cpus in this group.
*/
@@ -652,9 +695,9 @@ extern int group_balance_cpu(struct sched_group *sg);
/*
* Return the group to which this tasks belongs.
*
- * We cannot use task_subsys_state() and friends because the cgroup
- * subsystem changes that value before the cgroup_subsys::attach() method
- * is called, therefore we cannot pin it and might observe the wrong value.
+ * We cannot use task_css() and friends because the cgroup subsystem
+ * changes that value before the cgroup_subsys::attach() method is called,
+ * therefore we cannot pin it and might observe the wrong value.
*
* The same is true for autogroup's p->signal->autogroup->tg, the autogroup
* core changes this before calling sched_move_task().
@@ -706,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
task_thread_info(p)->cpu = cpu;
+ p->wake_cpu = cpu;
#endif
}
@@ -884,24 +928,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WF_FORK 0x02 /* child wakeup after fork */
#define WF_MIGRATED 0x4 /* internal use, task got migrated */
-static inline void update_load_add(struct load_weight *lw, unsigned long inc)
-{
- lw->weight += inc;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
-{
- lw->weight -= dec;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_set(struct load_weight *lw, unsigned long w)
-{
- lw->weight = w;
- lw->inv_weight = 0;
-}
-
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -979,7 +1005,7 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1028,17 +1054,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_balance(int this_cpu, struct rq *this_rq);
-/*
- * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
- * becomes useful in lb
- */
-#if defined(CONFIG_FAIR_GROUP_SCHED)
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
-#else
-static inline void idle_enter_fair(struct rq *this_rq) {}
-static inline void idle_exit_fair(struct rq *this_rq) {}
-#endif
#else /* CONFIG_SMP */
@@ -1051,7 +1068,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
-extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
@@ -1063,6 +1079,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
extern void update_idle_cpu_load(struct rq *this_rq);
+extern void init_task_runnable_average(struct task_struct *p);
+
#ifdef CONFIG_PARAVIRT
static inline u64 steal_ticks(u64 steal)
{
@@ -1233,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
+static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ raw_spin_lock(l1);
+ raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
/*
* double_rq_lock - safely lock two runqueues
*
@@ -1318,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a5..4ab7043 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
*/
-static inline void sched_info_dequeued(struct task_struct *t)
+static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = task_rq(t)->clock, delta = 0;
+ unsigned long long now = rq_clock(rq), delta = 0;
if (unlikely(sched_info_on()))
if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
- rq_sched_info_dequeued(task_rq(t), delta);
+ rq_sched_info_dequeued(rq, delta);
}
/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
-static void sched_info_arrive(struct task_struct *t)
+static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = task_rq(t)->clock, delta = 0;
+ unsigned long long now = rq_clock(rq), delta = 0;
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
- rq_sched_info_arrive(task_rq(t), delta);
+ rq_sched_info_arrive(rq, delta);
}
/*
@@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t)
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct task_struct *t)
+static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (!t->sched_info.last_queued)
- t->sched_info.last_queued = task_rq(t)->clock;
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Called when a process ceases being the active-running process involuntarily
+ * due, typically, to expiring its time slice (this may also be called when
+ * switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
* sched_info_queued() to mark that it has now again started waiting on
* the runqueue.
*/
-static inline void sched_info_depart(struct task_struct *t)
+static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
- unsigned long long delta = task_rq(t)->clock -
+ unsigned long long delta = rq_clock(rq) -
t->sched_info.last_arrival;
- rq_sched_info_depart(task_rq(t), delta);
+ rq_sched_info_depart(rq, delta);
if (t->state == TASK_RUNNING)
- sched_info_queued(t);
+ sched_info_queued(rq, t);
}
/*
@@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
{
- struct rq *rq = task_rq(prev);
-
/*
* prev now departs the cpu. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
if (prev != rq->idle)
- sched_info_depart(prev);
+ sched_info_depart(rq, prev);
if (next != rq->idle)
- sched_info_arrive(next);
+ sched_info_arrive(rq, next);
}
static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
- __sched_info_switch(prev, next);
+ __sched_info_switch(rq, prev, next);
}
#else
-#define sched_info_queued(t) do { } while (0)
+#define sched_info_queued(rq, t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
-#define sched_info_dequeued(t) do { } while (0)
-#define sched_info_switch(t, next) do { } while (0)
+#define sched_info_dequeued(rq, t) do { } while (0)
+#define sched_info_depart(rq, t) do { } while (0)
+#define sched_info_arrive(rq, next) do { } while (0)
+#define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*
@@ -162,6 +165,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
*/
/**
+ * cputimer_running - return true if cputimer is running
+ *
+ * @tsk: Pointer to target task.
+ */
+static inline bool cputimer_running(struct task_struct *tsk)
+
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer->running)
+ return false;
+
+ /*
+ * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
+ * in __exit_signal(), we won't account to the signal struct further
+ * cputime consumed by that task, even though the task can still be
+ * ticking after __exit_signal().
+ *
+ * In order to keep a consistent behaviour between thread group cputime
+ * and thread group cputimer accounting, lets also ignore the cputime
+ * elapsing after __exit_signal() in any thread group timer running.
+ *
+ * This makes sure that POSIX CPU clocks and timers are synchronized, so
+ * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
+ * clock delta is behind the expiring timer value.
+ */
+ if (unlikely(!tsk->sighand))
+ return false;
+
+ return true;
+}
+
+/**
* account_group_user_time - Maintain utime for a thread group.
*
* @tsk: Pointer to task structure.
@@ -176,7 +212,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
@@ -199,7 +235,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
@@ -222,7 +258,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index da5eb5b..47197de 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
struct task_struct *stop = rq->stop;
if (stop && stop->on_rq) {
- stop->se.exec_start = rq->clock_task;
+ stop->se.exec_start = rq_clock_task(rq);
return stop;
}
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
struct task_struct *curr = rq->curr;
u64 delta_exec;
- delta_exec = rq->clock_task - curr->se.exec_start;
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq->clock_task;
+ curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
}
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- stop->se.exec_start = rq->clock_task;
+ stop->se.exec_start = rq_clock_task(rq);
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
new file mode 100644
index 0000000..7d50f79
--- /dev/null
+++ b/kernel/sched/wait.c
@@ -0,0 +1,504 @@
+/*
+ * Generic waiting primitives.
+ *
+ * (C) 2004 Nadia Yvette Chambers, Oracle
+ */
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+
+void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
+{
+ spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+
+EXPORT_SYMBOL(__init_waitqueue_head);
+
+void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue);
+
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue_tail(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __remove_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
+
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, wake_flags, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+ __wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+ int wake_flags = 1; /* XXX WF_SYNC */
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+/*
+ * Note: we use "set_current_state()" _after_ the wait-queue add,
+ * because we need a memory barrier there on SMP, so that any
+ * wake-function that tests for the wait-queue being active
+ * will be guaranteed to see waitqueue addition _or_ subsequent
+ * tests in this thread will see the wakeup having taken place.
+ *
+ * The spin_unlock() itself is semi-permeable and only protects
+ * one way (it only protects stuff inside the critical region and
+ * stops them from bleeding out - it would still allow subsequent
+ * loads to move into the critical region).
+ */
+void
+prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait);
+
+void
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue_tail(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ if (signal_pending_state(state, current))
+ return -ERESTARTSYS;
+
+ wait->private = current;
+ wait->func = autoremove_wake_function;
+
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list)) {
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_tail(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ }
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
+
+/**
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ /*
+ * We can check for list emptiness outside the lock
+ * IFF:
+ * - we use the "careful" check that verifies both
+ * the next and prev pointers, so that there cannot
+ * be any half-pending updates in progress on other
+ * CPU's that we haven't seen yet (and that might
+ * still change the stack area.
+ * and
+ * - all other users take the lock (ie we can only
+ * have _one_ other CPU that looks at or modifies
+ * the list).
+ */
+ if (!list_empty_careful(&wait->task_list)) {
+ spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_wait);
+
+/**
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @mode: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and no one wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_locked_key(q, mode, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wait, mode, sync, key);
+
+ if (ret)
+ list_del_init(&wait->task_list);
+ return ret;
+}
+EXPORT_SYMBOL(autoremove_wake_function);
+
+int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+ else
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched
+__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(void *), unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ if (test_bit(q->key.bit_nr, q->key.flags))
+ ret = (*action)(q->key.flags);
+ } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+ int (*action)(void *), unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched
+__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(void *), unsigned mode)
+{
+ do {
+ int ret;
+
+ prepare_to_wait_exclusive(wq, &q->wait, mode);
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(q->key.flags);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
+ } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+ finish_wait(wq, &q->wait);
+ return 0;
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+ int (*action)(void *), unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit_lock(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_clear_bit(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ const struct zone *zone = page_zone(virt_to_page(word));
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+ if (BITS_PER_LONG == 64) {
+ unsigned long q = (unsigned long)p;
+ return bit_waitqueue((void *)(q & ~1), q & 1);
+ }
+ return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
+ void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+ atomic_t *val = key->flags;
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ atomic_read(val) != 0)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
+ * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
+ * return codes halt waiting and return.
+ */
+static __sched
+int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(atomic_t *), unsigned mode)
+{
+ atomic_t *val;
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ val = q->key.flags;
+ if (atomic_read(val) == 0)
+ break;
+ ret = (*action)(val);
+ } while (!ret && atomic_read(val) != 0);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p) \
+ struct wait_bit_queue name = { \
+ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
+ .wait = { \
+ .private = current, \
+ .func = wake_atomic_t_function, \
+ .task_list = \
+ LIST_HEAD_INIT((name).wait.task_list), \
+ }, \
+ }
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+ unsigned mode)
+{
+ wait_queue_head_t *wq = atomic_t_waitqueue(p);
+ DEFINE_WAIT_ATOMIC_T(wait, p);
+
+ return __wait_on_atomic_t(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on a atomic_t
+ * @p: The atomic_t being waited on, a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not the by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+ __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);