path: root/kernel/sched/core.c
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 703
 1 file changed, 452 insertions(+), 251 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e85cda2..5ac63c9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,11 +513,12 @@ static inline void init_hrtick(void)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
+#ifdef CONFIG_SMP
void resched_task(struct task_struct *p)
{
int cpu;
- lockdep_assert_held(&task_rq(p)->lock);
+ assert_raw_spin_locked(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
@@ -525,10 +526,8 @@ void resched_task(struct task_struct *p)
set_tsk_need_resched(p);
cpu = task_cpu(p);
- if (cpu == smp_processor_id()) {
- set_preempt_need_resched();
+ if (cpu == smp_processor_id())
return;
- }
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
@@ -547,7 +546,6 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
@@ -695,6 +693,12 @@ void sched_avg_update(struct rq *rq)
}
}
+#else /* !CONFIG_SMP */
+void resched_task(struct task_struct *p)
+{
+ assert_raw_spin_locked(&task_rq(p)->lock);
+ set_tsk_need_resched(p);
+}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -763,14 +767,14 @@ static void set_load_weight(struct task_struct *p)
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(rq, p);
+ sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(rq, p);
+ sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -983,7 +987,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_preempt_count(p) & PREEMPT_ACTIVE));
+ !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
/*
@@ -1013,107 +1017,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
-static void __migrate_swap_task(struct task_struct *p, int cpu)
-{
- if (p->on_rq) {
- struct rq *src_rq, *dst_rq;
-
- src_rq = task_rq(p);
- dst_rq = cpu_rq(cpu);
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, cpu);
- activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
- } else {
- /*
- * Task isn't running anymore; make it appear like we migrated
- * it before it went to sleep. This means on wakeup we make the
- * previous cpu our targer instead of where it really is.
- */
- p->wake_cpu = cpu;
- }
-}
-
-struct migration_swap_arg {
- struct task_struct *src_task, *dst_task;
- int src_cpu, dst_cpu;
-};
-
-static int migrate_swap_stop(void *data)
-{
- struct migration_swap_arg *arg = data;
- struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
-
- src_rq = cpu_rq(arg->src_cpu);
- dst_rq = cpu_rq(arg->dst_cpu);
-
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
- if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
-
- if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
-
- if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
- goto unlock;
-
- if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
- goto unlock;
-
- __migrate_swap_task(arg->src_task, arg->dst_cpu);
- __migrate_swap_task(arg->dst_task, arg->src_cpu);
-
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
-}
-
-/*
- * Cross migrate two tasks
- */
-int migrate_swap(struct task_struct *cur, struct task_struct *p)
-{
- struct migration_swap_arg arg;
- int ret = -EINVAL;
-
- arg = (struct migration_swap_arg){
- .src_task = cur,
- .src_cpu = task_cpu(cur),
- .dst_task = p,
- .dst_cpu = task_cpu(p),
- };
-
- if (arg.src_cpu == arg.dst_cpu)
- goto out;
-
- /*
- * These three tests are all lockless; this is OK since all of them
- * will be re-checked with proper locks held further down the line.
- */
- if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
- goto out;
-
- if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
- goto out;
-
- if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
- goto out;
-
- ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
-
-out:
- return ret;
-}
-
struct migration_arg {
struct task_struct *task;
int dest_cpu;
@@ -1333,9 +1236,9 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1427,13 +1330,12 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
- u64 max = 2*rq->max_idle_balance_cost;
+ u64 max = 2*sysctl_sched_migration_cost;
- update_avg(&rq->avg_idle, delta);
-
- if (rq->avg_idle > max)
+ if (delta > max)
rq->avg_idle = max;
-
+ else
+ update_avg(&rq->avg_idle, delta);
rq->idle_stamp = 0;
}
#endif
@@ -1494,14 +1396,6 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- if (tif_need_resched())
- set_preempt_need_resched();
-
if (llist_empty(&this_rq()->wake_list)
&& !tick_nohz_full_cpu(smp_processor_id())
&& !got_nohz_idle_kick())
@@ -1619,7 +1513,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1701,7 +1595,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*
* __sched_fork() is basic setup used by init_idle() too:
*/
-static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+static void __sched_fork(struct task_struct *p)
{
p->on_rq = 0;
@@ -1725,24 +1619,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
- p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_next_reset = jiffies;
p->mm->numa_scan_seq = 0;
}
- if (clone_flags & CLONE_VM)
- p->numa_preferred_nid = current->numa_preferred_nid;
- else
- p->numa_preferred_nid = -1;
-
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->numa_faults_buffer = NULL;
-
- INIT_LIST_HEAD(&p->numa_entry);
- p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -1768,12 +1654,12 @@ void set_numabalancing_state(bool enabled)
/*
* fork()/clone()-time setup:
*/
-void sched_fork(unsigned long clone_flags, struct task_struct *p)
+void sched_fork(struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
- __sched_fork(clone_flags, p);
+ __sched_fork(p);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
@@ -1831,7 +1717,10 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
- init_task_preempt_count(p);
+#ifdef CONFIG_PREEMPT_COUNT
+ /* Want to start with kernel preemption disabled. */
+ task_thread_info(p)->preempt_count = 1;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif
@@ -1858,7 +1747,7 @@ void wake_up_new_task(struct task_struct *p)
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
/* Initialize new task's runnable average */
@@ -1949,7 +1838,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
trace_sched_switch(prev, next);
- sched_info_switch(rq, prev, next);
+ sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
@@ -2001,8 +1890,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
- task_numa_free(prev);
-
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -2186,7 +2073,7 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2253,20 +2140,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns = 0;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
- /*
- * 64-bit doesn't need locks to atomically read a 64bit value.
- * So we have a optimization chance when the task's delta_exec is 0.
- * Reading ->on_cpu is racy, but this is ok.
- *
- * If we race with it leaving cpu, we'll take a lock. So we're correct.
- * If we race with it entering cpu, unaccounted time is 0. This is
- * indistinguishable from the read occurring a few cycles earlier.
- */
- if (!p->on_cpu)
- return p->se.sum_exec_runtime;
-#endif
-
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, p, &flags);
@@ -2342,7 +2215,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
-void __kprobes preempt_count_add(int val)
+void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2351,7 +2224,7 @@ void __kprobes preempt_count_add(int val)
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
- __preempt_count_add(val);
+ preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
@@ -2362,9 +2235,9 @@ void __kprobes preempt_count_add(int val)
if (preempt_count() == val)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
-EXPORT_SYMBOL(preempt_count_add);
+EXPORT_SYMBOL(add_preempt_count);
-void __kprobes preempt_count_sub(int val)
+void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2382,9 +2255,9 @@ void __kprobes preempt_count_sub(int val)
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
- __preempt_count_sub(val);
+ preempt_count() -= val;
}
-EXPORT_SYMBOL(preempt_count_sub);
+EXPORT_SYMBOL(sub_preempt_count);
#endif
@@ -2557,7 +2430,6 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
- clear_preempt_need_resched();
rq->skip_clock_update = 0;
if (likely(prev != next)) {
@@ -2648,9 +2520,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ add_preempt_count_notrace(PREEMPT_ACTIVE);
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ sub_preempt_count_notrace(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -2660,7 +2532,6 @@ asmlinkage void __sched notrace preempt_schedule(void)
} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
-#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
@@ -2670,19 +2541,20 @@ EXPORT_SYMBOL(preempt_schedule);
*/
asmlinkage void __sched preempt_schedule_irq(void)
{
+ struct thread_info *ti = current_thread_info();
enum ctx_state prev_state;
/* Catch callers which need to be fixed */
- BUG_ON(preempt_count() || !irqs_disabled());
+ BUG_ON(ti->preempt_count || !irqs_disabled());
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
__schedule();
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ sub_preempt_count(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -2694,6 +2566,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
exception_exit(prev_state);
}
+#endif /* CONFIG_PREEMPT */
+
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
@@ -2701,6 +2575,393 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, wake_flags, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
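The wake-up core restored above is normally reached through the wait_event*()/wake_up() wrappers rather than called directly. A minimal usage sketch, not part of this patch (demo_wq, demo_ready and the demo_* functions are hypothetical names):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

/* Sleeper: blocks until demo_ready becomes non-zero (or a signal arrives). */
static int demo_consumer(void)
{
	return wait_event_interruptible(demo_wq, demo_ready);
}

/* Waker: publish the condition, then kick any sleepers. */
static void demo_producer(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);	/* expands to __wake_up(&demo_wq, TASK_NORMAL, 1, NULL) */
}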
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+ __wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+ int wake_flags = WF_SYNC;
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
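The _sync variants let a waker that is about to block itself hint that the woken task should stay on the current CPU. A hedged sketch of the usual caller pattern (pipe_like_wq and demo_wake_then_sleep are made-up names):

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(pipe_like_wq);

/*
 * Hypothetical waker that is about to sleep itself: the _sync wakeup
 * (which funnels into __wake_up_sync_key() with WF_SYNC) hints that the
 * woken reader can stay on this CPU instead of being migrated.
 */
static void demo_wake_then_sleep(void)
{
	wake_up_interruptible_sync(&pipe_like_wq);
	/* ... the caller blocks or schedules away shortly afterwards ... */
}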
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
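complete() and wait_for_completion() pair up as a one-shot handshake between a waiter and a worker. A minimal sketch of that pattern, not part of the patch (demo_* names are hypothetical, error handling kept to a minimum):

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *unused)
{
	/* ... do the actual work ... */
	complete(&demo_done);		/* wakes exactly one waiter */
	return 0;
}

static void demo_wait_for_worker(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(demo_worker, NULL, "demo_worker");
	if (!IS_ERR(tsk))
		wait_for_completion(&demo_done);	/* uninterruptible, no timeout */
}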
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
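Callers of the _timeout variant have to distinguish the 0 (timed out) return from the positive remaining-jiffies return documented above. A small illustrative helper (hypothetical name, assumed 100ms budget):

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

/* Hypothetical helper: wait up to 100ms for a completion signalled elsewhere. */
static int demo_wait_with_timeout(struct completion *done)
{
	unsigned long left;

	left = wait_for_completion_timeout(done, msecs_to_jiffies(100));
	if (!left)
		return -ETIMEDOUT;	/* 0: the timeout expired first */

	return 0;			/* >= 1: completed with 'left' jiffies to spare */
}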
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
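For the interruptible and killable variants the -ERESTARTSYS return must be propagated so signal delivery behaves as documented above. A hedged sketch (hypothetical caller name):

#include <linux/completion.h>
#include <linux/errno.h>

/*
 * Hypothetical caller: the killable variant returns -ERESTARTSYS when a
 * fatal signal arrives first, and that error has to be passed back up.
 */
static int demo_wait_killable(struct completion *x)
{
	int ret = wait_for_completion_killable(x);

	if (ret == -ERESTARTSYS)
		return ret;	/* interrupted by a fatal signal */

	return 0;		/* completion was signalled */
}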
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
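try_wait_for_completion() and completion_done() allow a completion to be polled from contexts that must not sleep, for example when it is used as a counting resource. A minimal, hypothetical sketch:

#include <linux/completion.h>
#include <linux/types.h>

/*
 * Hypothetical non-blocking claim of a counting completion, usable from
 * contexts that cannot sleep.
 */
static bool demo_try_claim(struct completion *resource)
{
	if (try_wait_for_completion(resource))
		return true;	/* consumed one 'done' count without sleeping */

	return false;		/* nothing available; a waiter would have to block */
}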
+
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
@@ -3337,11 +3598,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
struct task_struct *p;
int retval;
+ get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
+ put_online_cpus();
return -ESRCH;
}
@@ -3398,6 +3661,7 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
+ put_online_cpus();
return retval;
}
@@ -3442,6 +3706,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
+ get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -3454,11 +3719,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
+ put_online_cpus();
return retval;
}
@@ -3528,11 +3794,16 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
+static inline int should_resched(void)
+{
+ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
static void __cond_resched(void)
{
- __preempt_count_add(PREEMPT_ACTIVE);
+ add_preempt_count(PREEMPT_ACTIVE);
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ sub_preempt_count(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
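The should_resched()/__cond_resched() pair restored above sits behind cond_resched(), which long-running kernel loops call to yield only when a reschedule is actually pending. A brief, hypothetical illustration:

#include <linux/sched.h>

/*
 * Hypothetical long-running loop: cond_resched() ends up in the
 * _cond_resched()/__cond_resched() path above and only yields when
 * should_resched() reports a pending reschedule.
 */
static void demo_process_many(unsigned long nr_items)
{
	unsigned long i;

	for (i = 0; i < nr_items; i++) {
		/* ... per-item work ... */
		cond_resched();
	}
}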
@@ -3915,7 +4186,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
- __sched_fork(0, idle);
+ __sched_fork(idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -3941,7 +4212,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
- init_idle_preempt_count(idle, cpu);
+ task_thread_info(idle)->preempt_count = 0;
/*
* The idle tasks have their own, simple scheduling class:
@@ -4075,53 +4346,6 @@ fail:
return ret;
}
-#ifdef CONFIG_NUMA_BALANCING
-/* Migrate current task p to target_cpu */
-int migrate_task_to(struct task_struct *p, int target_cpu)
-{
- struct migration_arg arg = { p, target_cpu };
- int curr_cpu = task_cpu(p);
-
- if (curr_cpu == target_cpu)
- return 0;
-
- if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
- return -EINVAL;
-
- /* TODO: This is not properly updating schedstats */
-
- return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
-}
-
-/*
- * Requeue a task on a given node and accurately track the number of NUMA
- * tasks on the runqueues
- */
-void sched_setnuma(struct task_struct *p, int nid)
-{
- struct rq *rq;
- unsigned long flags;
- bool on_rq, running;
-
- rq = task_rq_lock(p, &flags);
- on_rq = p->on_rq;
- running = task_current(rq, p);
-
- if (on_rq)
- dequeue_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (running)
- p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
- task_rq_unlock(rq, p, &flags);
-}
-#endif
-
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
@@ -4761,7 +4985,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rd yet then
+ * If we dont want to free the old_rt yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -4895,9 +5119,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
@@ -4909,19 +5130,11 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- sd = sd->parent; /* sd_busy */
}
- rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
-
- sd = lowest_flag_domain(cpu, SD_NUMA);
- rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
-
- sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
- rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
@@ -5441,7 +5654,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_SHARE_PKG_RESOURCES
| 1*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
- | 1*SD_NUMA
| sd_local_flags(level)
,
.last_balance = jiffies,
@@ -6123,17 +6335,14 @@ void __init sched_init_smp(void)
sched_init_numa();
- /*
- * There's no userspace yet to cause hotplug operations; hence all the
- * cpu masks are stable and all blatant races in the below code cannot
- * happen.
- */
+ get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
+ put_online_cpus();
hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6296,7 +6505,6 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7069,12 +7277,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
- /*
- * If we need to toggle cfs_bandwidth_used, off->on must occur
- * before making related changes, and on->off must occur afterwards
- */
- if (runtime_enabled && !runtime_was_enabled)
- cfs_bandwidth_usage_inc();
+ account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7100,8 +7303,6 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
- if (runtime_was_enabled && !runtime_enabled)
- cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);