From 33c3d6c61debcc0d295fe65521cfbc45409936c7 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 9 Feb 2010 14:43:59 -0500 Subject: sched: Cleanup pre_schedule_rt Since [commit 9a897c5a: sched: RT-balance, replace hooks with pre/post schedule and wakeup methods] we must call pre_schedule_rt if prev is rt task. So condition rt_task(prev) is always true and the 'unlikely' declaration is simply incorrect. Signed-off-by: Yong Zhang Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Rusty Russell Signed-off-by: Steven Rostedt diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79..1ab66a2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1474,7 +1474,7 @@ skip: static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) + if (rq->rt.highest_prio.curr > prev->prio) pull_rt_task(rq); } -- cgit v0.10.2 From 8e54a2c036d8c47195f094af1628834f4c55844a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 11:28:30 -0500 Subject: sched: Change pick_next_task_rt from unlikely to likely The if (unlikely(!rt_rq->rt_nr_running)) test in pick_next_task_rt() tests if there is another rt task ready to run. If so, then pick it. In most systems, only one RT task runs at a time most of the time. Running the branch unlikely annotator profiler on a system doing average work "running firefox, evolution, xchat, distcc builds, etc", it showed the following: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 324344 135104992 99 _pick_next_task_rt sched_rt.c 1064 99% of the time the condition is true. When an RT task schedules out, it is unlikely that another RT task is waiting to run on that same run queue. Simply remove the unlikely() condition. Acked-by: Gregory Haskins Cc:Peter Zijlstra Signed-off-by: Steven Rostedt diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1ab66a2..c2266c4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1062,7 +1062,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) rt_rq = &rq->rt; - if (unlikely(!rt_rq->rt_nr_running)) + if (!rt_rq->rt_nr_running) return NULL; if (rt_rq_throttled(rt_rq)) -- cgit v0.10.2 From 63f01241176d7cbc976385aec32f0a209b0bc36a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 14:48:10 -0500 Subject: sched: Remove unlikely() from rt_policy() in sched.c The rt_policy() has an unlikely() that the policy it is checking is of RT priority (SCHED_FIFO or SCHED_RR). According to the annotate branch profiler it is incorrect most of the time: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 36667 654674 94 rt_policy sched.c 126 This makes sense because the rt_policy() is used by the sched_set_scheduler() and nice(). Although users may use sys_nice a bit, all RT users use the sched_set_scheduler() to set their RT priority, including kernel threads. The above numbers were from a normal desktop computer running firefox, evolution, xchat and was part of a distcc compile farm. 
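For context, the likely()/unlikely() annotations that these patches keep removing are thin wrappers around GCC's __builtin_expect() hint. A simplified sketch (the real definitions live in include/linux/compiler.h and also feed the annotated-branch profiler that produced the tables quoted in these changelogs):

#define likely(x)	__builtin_expect(!!(x), 1)	/* hint: x is almost always true  */
#define unlikely(x)	__builtin_expect(!!(x), 0)	/* hint: x is almost always false */

The profiler counts, per annotation site, how often the hinted outcome matched reality; an "incorrect" rate of 94% means the branch went against the hint 94% of the time, so dropping the hint and letting the compiler and CPU predict on their own is the safe choice.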
Cc: Peter Zijlstra Signed-off-by: Steven Rostedt diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d..269a045 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -123,7 +123,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } -- cgit v0.10.2 From e69c634190dc724ef2d845ace8d783031d3e492e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 17:10:31 -0500 Subject: sched: Remove unlikely() from ttwu_post_activation The unlikely() used in ttwu_post_activation() tests if the rq->idle_stamp is set. But since this is for a wakeup, and wakeups happen when tasks block on IO, and blocking tasks on IO may put the system into idle, this can actually be a common occurrence. Running the annotated branch profiler on an average desktop running firefox, evolution, xchat and distcc, the report shows: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 34884862 146110926 80 ttwu_post_activation sched.c 2309 80% of the time, this unlikely is incorrect. Best not to assume what the result is, and just remove the branch annotation. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt diff --git a/kernel/sched.c b/kernel/sched.c index 269a045..6d24b2e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,7 +2458,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; -- cgit v0.10.2 From 2da8c8bc44b572cbf623629ff736608dc7968436 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Jun 2011 22:53:39 +0200 Subject: sched: Remove pointless in_atomic() definition check It's really supposed to be defined here. If it's not then we actually want the build to crash so that we know it, and not keep it silent. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra diff --git a/kernel/sched.c b/kernel/sched.c index fd18f39..01d9536 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8028,7 +8028,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8050,7 +8049,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif -- cgit v0.10.2 From bdd4e85dc36cdbcfc1608a5b2a17c80a9db8986a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 01:13:27 +0200 Subject: sched: Isolate preempt counting in its own config option Create a new CONFIG_PREEMPT_COUNT that handles the inc/dec of the preempt count offset independently, so that the offset can be updated by preempt_disable() and preempt_enable() even without the need for CONFIG_PREEMPT being set. This prepares to make CONFIG_DEBUG_SPINLOCK_SLEEP work with !CONFIG_PREEMPT, where it currently doesn't detect code that sleeps inside explicit preemption disabled sections. Signed-off-by: Frederic Weisbecker Acked-by: Paul E.
McKenney Cc: Ingo Molnar Cc: Peter Zijlstra diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index b4326bf..564d997 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -88,7 +88,7 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) { #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) return test_bit(bitnum, addr); -#elif defined CONFIG_PREEMPT +#elif defined CONFIG_PREEMPT_COUNT return preempt_count(); #else return 1; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index ba36217..f743883 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -93,7 +93,7 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) -#if defined(CONFIG_PREEMPT) +#if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else # define PREEMPT_CHECK_OFFSET 0 @@ -115,7 +115,7 @@ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 716875e..8e38d4c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -134,7 +134,7 @@ static inline int page_cache_get_speculative(struct page *page) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif /* @@ -172,7 +172,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif VM_BUG_ON(page_count(page) == 0); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 2e681d9..58969b2 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -27,6 +27,21 @@ asmlinkage void preempt_schedule(void); +#define preempt_check_resched() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + preempt_schedule(); \ +} while (0) + +#else /* !CONFIG_PREEMPT */ + +#define preempt_check_resched() do { } while (0) + +#endif /* CONFIG_PREEMPT */ + + +#ifdef CONFIG_PREEMPT_COUNT + #define preempt_disable() \ do { \ inc_preempt_count(); \ @@ -39,12 +54,6 @@ do { \ dec_preempt_count(); \ } while (0) -#define preempt_check_resched() \ -do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ - preempt_schedule(); \ -} while (0) - #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -80,18 +89,17 @@ do { \ preempt_check_resched(); \ } while (0) -#else +#else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) -#define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() do { } while (0) #define preempt_enable_no_resched_notrace() do { } while (0) #define preempt_enable_notrace() do { } while (0) -#endif +#endif /* CONFIG_PREEMPT_COUNT */ #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 99f9aa7..8f4f881 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -239,7 +239,7 @@ extern int rcu_read_lock_bh_held(void); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. 
*/ -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { int lockdep_opinion = 0; @@ -250,12 +250,12 @@ static inline int rcu_read_lock_sched_held(void) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -276,17 +276,17 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { return preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 483c1ed..4ecd5cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2502,7 +2502,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf987b9..24e7cb0 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + select PREEMPT_COUNT help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -52,3 +53,5 @@ config PREEMPT endchoice +config PREEMPT_COUNT + bool \ No newline at end of file diff --git a/kernel/sched.c b/kernel/sched.c index 01d9536..90ad7cf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2843,7 +2843,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif -- cgit v0.10.2 From e8f7c70f44f75c827c04239b0ae5f0068b65b76e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 01:51:02 +0200 Subject: sched: Make sleeping inside spinlock detection working in !CONFIG_PREEMPT Select CONFIG_PREEMPT_COUNT when we enable the sleeping inside spinlock detection, so that the preempt offset gets correctly incremented/decremented from preempt_disable()/preempt_enable(). This makes the preempt count eventually working in !CONFIG_PREEMPT when that debug option is set and thus fixes the detection of explicit preemption disabled sections under such config. Code that sleeps in explicitly preempt disabled section can be finally spotted in non-preemptible kernels. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. 
McKenney Cc: Ingo Molnar Cc: Peter Zijlstra diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28afa4c..a7dd7b5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -650,6 +650,7 @@ config TRACE_IRQFLAGS config DEBUG_SPINLOCK_SLEEP bool "Spinlock debugging: sleep-inside-spinlock checking" + select PREEMPT_COUNT depends on DEBUG_KERNEL help If you say Y here, various routines which may sleep will become very -- cgit v0.10.2 From d902db1eb60387040fe541573083e47469db50ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 19:31:56 +0200 Subject: sched: Generalize sleep inside spinlock detection The sleeping inside spinlock detection is actually used for more general sleeping inside atomic sections debugging: preemption disabled, rcu read side critical sections, interrupts, interrupt disabled, etc... Change the name of the config and its help section to reflect its more general role. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Acked-by: Randy Dunlap Cc: Peter Zijlstra Cc: Ingo Molnar diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index 7b3f493..07a9c48 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -409,7 +409,7 @@ cond_resched(); /* Will sleep */ You should always compile your kernel - CONFIG_DEBUG_SPINLOCK_SLEEP on, and it will warn + CONFIG_DEBUG_ATOMIC_SLEEP on, and it will warn you if you break these rules. If you do break the rules, you will eventually lock up your box. diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index da0382d..7b13be4 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -53,7 +53,7 @@ kernel patches. 12: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP all simultaneously enabled. 13: Has been build- and runtime tested with and without CONFIG_SMP and diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding index f3f1a46..83f5f5b 100644 --- a/Documentation/development-process/4.Coding +++ b/Documentation/development-process/4.Coding @@ -244,7 +244,7 @@ testing purposes. In particular, you should turn on: - DEBUG_SLAB can find a variety of memory allocation and use errors; it should be used on most development kernels. - - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a + - DEBUG_SPINLOCK, DEBUG_ATOMIC_SLEEP, and DEBUG_MUTEXES will find a number of common locking errors. 
There are quite a few other debugging options, some of which will be diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist index 2df4576..cb5507b 100644 --- a/Documentation/ja_JP/SubmitChecklist +++ b/Documentation/ja_JP/SubmitChecklist @@ -68,7 +68,7 @@ Linux カーネルパッチ投稿者向けチェックリスト 12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK, - CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を + CONFIG_DEBUG_ATOMIC_SLEEP これら全てを同時に有効にして動作確認を 行ってください。 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で diff --git a/Documentation/zh_CN/SubmitChecklist b/Documentation/zh_CN/SubmitChecklist index 951415b..4c741d6 100644 --- a/Documentation/zh_CN/SubmitChecklist +++ b/Documentation/zh_CN/SubmitChecklist @@ -67,7 +67,7 @@ Linux 12ѾͨCONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEPԣͬʱ + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEPԣͬʱ ʹܡ 13Ѿʹû߲ʹ CONFIG_SMP CONFIG_PREEMPTִʱ䡣 diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fb0e732..24b489f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -121,7 +121,7 @@ extern int _cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep diff --git a/kernel/sched.c b/kernel/sched.c index 90ad7cf..a5f318b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,7 +8018,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a7dd7b5..81a4f33 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -648,13 +648,15 @@ config TRACE_IRQFLAGS Enables hooks to interrupt enabling and disabling for either tracing or lock debugging. -config DEBUG_SPINLOCK_SLEEP - bool "Spinlock debugging: sleep-inside-spinlock checking" +config DEBUG_ATOMIC_SLEEP + bool "Sleep inside atomic section checking" select PREEMPT_COUNT depends on DEBUG_KERNEL help If you say Y here, various routines which may sleep will become very - noisy if they are called with a spinlock held. + noisy if they are called inside atomic sections: when a spinlock is + held, inside an rcu read side critical section, inside preempt disabled + sections, inside an interrupt, etc... config DEBUG_LOCKING_API_SELFTESTS bool "Locking API boot-time self-tests" -- cgit v0.10.2 From 2a46dae38087e62dd5fb08a6dadf1407717ed13c Mon Sep 17 00:00:00 2001 From: "Nikunj A. Dadhania" Date: Tue, 7 Jun 2011 15:43:22 +0530 Subject: sched: Remove rcu_read_lock() from wake_affine() wake_affine() is only called from one path: select_task_rq_fair(), which already has the RCU read lock held. Signed-off-by: Nikunj A. Dadhania Signed-off-by: Peter Zijlstra Cc: Paul E. 
McKenney Link: http://lkml.kernel.org/r/20110607101251.777.34547.stgit@IBM-009124035060.in.ibm.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 433491c2..eb98f77 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1481,7 +1481,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * effect of the currently running task from the load * of the current CPU: */ - rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1517,7 +1516,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; - rcu_read_unlock(); /* * If the currently running task will sleep within -- cgit v0.10.2 From 307bf9803f25a8a3f53c1012110fb74e2f893eb0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 15:08:55 +0200 Subject: sched: Simplify mutex_spin_on_owner() It does not make sense to rcu_read_lock/unlock() in every loop iteration while spinning on the mutex. Move the rcu protection outside the loop. Also simplify the return path to always check for lock->owner == NULL which meets the requirements of both owner changed and need_resched() caused loop exits. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1106101458350.11814@ionos Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 5925275..e355ee7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4306,11 +4306,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4320,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4336,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif -- cgit v0.10.2 From 1c09ab0d257317f97e8629a3d0c8713d6dd9de4c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 28 Jun 2011 10:51:31 +0800 Subject: sched: Skip autogroup when looking for all rt sched groups Since commit ec514c48 ("sched: Fix rt_rq runtime leakage bug") 'cat /proc/sched_debug' will print data of root_task_group.rt_rq multiple times. This is because autogroup does not have its own rt group, instead rt group of autogroup is linked to root_task_group. So skip it when we are looking for all rt sched groups, and it will also save some noop operation against root_task_group when __disable_runtime()/__enable_runtime(). -v2: Based on Cheng Xu's idea which uses less code. 
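The iterator added below relies on task_group_is_autogroup(), which this patch only forward-declares. Under CONFIG_SCHED_AUTOGROUP it boils down to testing the task_group's autogroup backpointer; a rough sketch (the exact body is recalled from that era's autogroup code and should be treated as an assumption, not a quote from this series):

#ifdef CONFIG_SCHED_AUTOGROUP
static inline bool task_group_is_autogroup(struct task_group *tg)
{
	/* only autogroups carry a pointer to a struct autogroup */
	return !!tg->autogroup;
}
#else
static inline bool task_group_is_autogroup(struct task_group *tg)
{
	return false;
}
#endif

Since such groups have no rt_rq of their own and merely link to root_task_group's, skipping them keeps the iteration from visiting root_task_group's RT data more than once.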
Signed-off-by: Yong Zhang Cc: Mike Galbraith Cc: Cheng Xu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTi=87P3RoTF_UEtamNfc_XGxQXE__Q@mail.gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 0557705..c2f0e72 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -13,6 +13,7 @@ struct autogroup { int nice; }; +static inline bool task_group_is_autogroup(struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b03cd89..97540f0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) typedef struct task_group *rt_rq_iter_t; -#define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ - (&iter->list != &task_groups) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]); \ - iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) +static inline struct task_group *next_task_group(struct task_group *tg) +{ + do { + tg = list_entry_rcu(tg->list.next, + typeof(struct task_group), list); + } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); + + if (&tg->list == &task_groups) + tg = NULL; + + return tg; +} + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = container_of(&task_groups, typeof(*iter), list); \ + (iter = next_task_group(iter)) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]);) static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { -- cgit v0.10.2 From dab16ae1a9fc72a9f419f2dff91854e452d02a5e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Jul 2011 16:29:40 +0200 Subject: m32r: Use generic PREEMPT config Use the generic preempt config definition in m32r instead of using a custom one. This also makes it handle the new CONFIG_PREEMPT_COUNT that need to be selected by CONFIG_PREEMPT. Without that it breaks kernel/sched.c: In function 'preempt_schedule': kernel/sched.c:4364: error: implicit declaration of function 'add_preempt_count_notrace' kernel/sched.c:4366: error: implicit declaration of function 'sub_preempt_count_notrace' Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Hirokazu Takata diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 85b44e8..b92b944 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -268,17 +268,7 @@ config SCHED_OMIT_FRAME_POINTER bool default y -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +source "kernel/Kconfig.preempt" config SMP bool "Symmetric multi-processing support" -- cgit v0.10.2 From e22c8f4616c8e68b7659331409e9c081ce265498 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Jul 2011 17:45:34 +0200 Subject: h8300: Use generic config PREEMPT definition So that it can handle the new CONFIG_PREEMPT_COUNT. 
Signed-off-by: Frederic Weisbecker Cc: Yoshinori Sato diff --git a/arch/h8300/Kconfig.cpu b/arch/h8300/Kconfig.cpu index d236ab4..15c2228 100644 --- a/arch/h8300/Kconfig.cpu +++ b/arch/h8300/Kconfig.cpu @@ -162,9 +162,7 @@ config H8300_TPU_CH int "TPU channel" depends on H8300_TPU -config PREEMPT - bool "Preemptible Kernel" - default n +source "kernel/Kconfig.preempt" source "mm/Kconfig" -- cgit v0.10.2 From bd96efe17d945f0bad56d592f8686dc6309905e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Jul 2011 17:45:34 +0200 Subject: xtensa: Use generic config PREEMPT definition So that it can handle the new CONFIG_PREEMPT_COUNT. Signed-off-by: Frederic Weisbecker Cc: Chris Zankel diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 5d43c1f..c346ccd 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -80,18 +80,7 @@ config XTENSA_UNALIGNED_USER Say Y here to enable unaligned memory access in user space. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - Unfortunately the kernel code has some race conditions if both - CONFIG_SMP and CONFIG_PREEMPT are enabled, so this option is - currently disabled if you are building an SMP kernel. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +source "kernel/Kconfig.preempt" config MATH_EMULATION bool "Math emulation" -- cgit v0.10.2 From 9bbd7374361d9bfc75108c3ad1c1b6db28b1be59 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Tue, 5 Jul 2011 19:07:21 -0700 Subject: sched: update correct entity's runtime in check_preempt_wakeup() While looking at check_preempt_wakeup() I realized that we are potentially updating the wrong entity in the fair-group scheduling case. In this case the current task's cfs_rq may not be the same as the one used for the comparison between the waking task and the existing task's vruntime. This potentially results in us using a stale vruntime in the pre-emption decision, providing a small false preference for the previous task. The effects of this are bounded since we always perform a hierarchal update on the tick. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/CAPM31R+2Ke2urUZKao5W92_LupdR4AYEv-EZWiJ3tG=tEes2cw@mail.gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e7d67a9..f88720b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1919,8 +1919,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (!sched_feat(WAKEUP_PREEMPT)) return; - update_curr(cfs_rq); find_matching_se(&se, &pse); + update_curr(cfs_rq_of(se)); BUG_ON(!pse); if (wakeup_preempt_entity(se, pse) == 1) { /* -- cgit v0.10.2 From 9598c82dcacadc3b9daa8170613fd054c6124d30 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 6 Jul 2011 22:30:37 -0700 Subject: sched: Don't update shares twice on on_rq parent In dequeue_task_fair() we bail on dequeue when we encounter a parenting entity with additional weight. However, we perform a double shares update on this entity as we continue the shares update traversal from this point, despite dequeue_entity() having already updated its queuing cfs_rq. Avoid this by starting from the parent when we resume. 
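To see where the double update comes from, this is roughly the shape of the function being fixed, as a condensed, illustrative sketch only (buddy handling, hrtick and other details are elided; the hunk below shows the actual change):

static void dequeue_task_fair_sketch(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {			/* phase 1: dequeue bottom-up */
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);	/* already updates this cfs_rq */

		if (cfs_rq->load.weight) {		/* parent still has weight: stop here */
			se = parent_entity(se);		/* the fix: resume one level up */
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	for_each_sched_entity(se) {			/* phase 2: refresh load and shares upward */
		cfs_rq = cfs_rq_of(se);
		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}
}

Without the added reassignment, phase 2 starts at the very entity phase 1 stopped on, so its cfs_rq, which dequeue_entity() has just updated, receives a second shares update.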
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110707053059.797714697@google.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f88720b..6cdff84 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1370,6 +1370,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ if (task_sleep && parent_entity(se)) set_next_buddy(parent_entity(se)); + + /* avoid re-evaluating load for this entity */ + se = parent_entity(se); break; } flags |= DEQUEUE_SLEEP; -- cgit v0.10.2 From 9763b67fb9f3050c6da739105888327587c30c4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2011 13:09:25 +0200 Subject: sched, cgroup: Optimize load_balance_fair() Use for_each_leaf_cfs_rq() instead of list_for_each_entry_rcu(), so that load_balance_fair() iterates only those task_groups that actually have tasks on busiest, and so that we iterate bottom-up, trying to move light groups before the heavier ones. No idea if it will actually work out to be beneficial in practice; does anybody have a cgroup workload that might show a difference one way or the other? [ Also move update_h_load to sched_fair.c, losing #ifdef-ery ] Signed-off-by: Peter Zijlstra Reviewed-by: Paul Turner Link: http://lkml.kernel.org/r/1310557009.2586.28.camel@twins Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index b0e7ad7..474f341 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. - */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6cdff84..180bcf1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2232,11 +2232,43 @@ static void update_shares(int cpu) struct rq *rq = cpu_rq(cpu); rcu_read_lock(); + /* + * Iterates the task_group tree in a bottom up fashion, see + * list_add_leaf_cfs_rq() for details. + */ for_each_leaf_cfs_rq(rq, cfs_rq) update_shares_cpu(cfs_rq->tg, cpu); rcu_read_unlock(); } +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load.
+ */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->se[cpu]->load.weight; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2244,14 +2276,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; + struct cfs_rq *busiest_cfs_rq; rcu_read_lock(); - update_h_load(busiest_cpu); + update_h_load(cpu_of(busiest)); - list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; -- cgit v0.10.2 From 5f817d676b7b7ac4a29f5ed93063ae7a24550c12 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:31 +0200 Subject: sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED' This patch fixes a typo located in a comment. Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-2-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 474f341..3b3826e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8362,7 +8362,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } -- cgit v0.10.2 From 045176d22f08bc0b650a028df0f62fc3c2747699 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:32 +0200 Subject: sched: Remove unused function cpu_cfs_rq() The last reference to cpu_cfs_rq() was removed with commit 88ec22d3 ("sched: Remove the cfs_rq dependency from set_task_cpu()"). Thus, remove this function, too. 
Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-3-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 180bcf1..0588c0b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return cfs_rq->tg->cfs_rq[this_cpu]; -} - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -- cgit v0.10.2 From 99bc52429f11d1f4f81495ac8237085aaeb6bccf Mon Sep 17 00:00:00 2001 From: Bianca Lutz Date: Wed, 13 Jul 2011 20:13:36 +0200 Subject: sched: Do not attempt to destroy uninitialized rt_bandwidth If a task group is to be created and alloc_fair_sched_group() fails, then the rt_bandwidth of the corresponding task group is not yet initialized. The caller, sched_create_group(), starts a clean up procedure which calls free_rt_sched_group() which unconditionally destroys the not yet initialized rt_bandwidth. This crashes or hangs the system in lock_hrtimer_base(): UP systems dereference a NULL pointer, while SMP systems loop endlessly on a condition that cannot become true. This patch simply avoids the destruction of rt_bandwidth when the initialization code path was not reached. (This was discovered by accident with a custom kernel modification.) Signed-off-by: Bianca Lutz Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-7-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 3b3826e..f107204 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8383,7 +8383,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) -- cgit v0.10.2 From 26a148eb9c790149750f7e77da0d96029443d400 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 15 Jul 2011 11:41:31 +0100 Subject: sched: Reorder root_domain to remove 64 bit alignment padding Reorder root_domain to remove 8 bytes of alignment padding on 64 bit builds, this shrinks the size from 1736 to 1728 bytes, therefore using one fewer cachelines. Signed-off-by: Richard Kennedy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310726492.1977.5.camel@castor.rsk Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index f107204..e3f0bac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -422,6 +422,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +432,6 @@ struct root_domain { * one runnable RT task. 
*/ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; -- cgit v0.10.2 From acb5a9ba3bd7cd8b3264f67a3789a9587d3b935b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Thu, 14 Jul 2011 18:32:43 +0200 Subject: sched: Separate group-scheduling code more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up cfs/rt runqueue initialization by moving group scheduling related code into the corresponding functions. Also, keep group scheduling as an add-on, so that things are only done additionally, i. e. remove the init_*_rq() calls from init_tg_*_entry(). (This removes a redundant initalization during sched_init()). In case of group scheduling rt_rq->highest_prio.curr is now initialized twice, but adding another #ifdef seems not worth it. Signed-off-by: Jan H. Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310661163-16606-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index e3f0bac..6fdf7ff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7859,17 +7859,10 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -7889,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); @@ -7905,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7918,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7945,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -8032,7 +8024,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); 
init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -8335,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8425,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } -- cgit v0.10.2 From 2bd2d6f2dc952fc44fc52887de36e51896da96b9 Mon Sep 17 00:00:00 2001 From: Stephan Baerwolf Date: Wed, 20 Jul 2011 14:46:59 +0200 Subject: sched: Replace use of entity_key() "entity_key()" is only used in "__enqueue_entity()" and its only function is to subtract a tasks vruntime by its groups minvruntime. Before this patch a rbtree enqueue-decision is done by comparing two tasks in the style: "if (entity_key(cfs_rq, se) < entity_key(cfs_rq, entry))" which would be "if (se->vruntime-cfs_rq->min_vruntime < entry->vruntime-cfs_rq->min_vruntime)" or (if reducing cfs_rq->min_vruntime out) "if (se->vruntime < entry->vruntime)" which is "if (entity_before(se, entry))" So we do not need "entity_key()". If "entity_before()" is inline we will also save one subtraction (only one, because "entity_key(cfs_rq, se)" was cached in "key") Signed-off-by: Stephan Baerwolf Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ns12mnd2h5w8rb9agd8hnsfk@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0588c0b..a2ecbaa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -321,11 +321,6 @@ static inline int entity_before(struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } -static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return se->vruntime - cfs_rq->min_vruntime; -} - static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; @@ -359,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -372,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * We dont care about collisions. Nodes with * the same key stay together. */ - if (key < entity_key(cfs_rq, entry)) { + if (entity_before(se, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; -- cgit v0.10.2 From 0f3171438fc917b9f6b8b60dbb7a3fff9a0f68fd Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 22 Jul 2011 09:14:31 +0800 Subject: sched: Cleanup duplicate local variable in [enqueue|dequeue]_task_fair No need to define a new "cfs_rq" variable in the "for" block. Just use the one at the top of the function. 
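For reference, the loop in question climbs the task's scheduling-entity hierarchy, so cfs_rq_of(se) changes at every level and the outer-scope variable can simply be reassigned rather than shadowed. The macro looks roughly like this (a sketch of the kernel/sched_fair.c definition of that era):

#ifdef CONFIG_FAIR_GROUP_SCHED
/* walk from the task's entity up through its parent group entities */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
#else
/* no group scheduling: the "hierarchy" is just the task's own entity */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)
#endif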
Signed-off-by: Lin Ming Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1311297271.3938.1352.camel@minggr.sh.intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a2ecbaa..bc8ee99 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1317,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); @@ -1360,7 +1360,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); -- cgit v0.10.2
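One relationship worth spelling out from the load_balance_fair()/update_h_load() change earlier in this series: tg_load_down() pushes a hierarchical load factor top-down, attributing to each group the fraction of the CPU's load that its entity contributes to its parent runqueue. Restated as a standalone helper with invented example numbers (an illustration only, not kernel code; the helper name is made up):

/* h_load propagation as performed by tg_load_down(), quoted above */
static unsigned long h_load_of_child(unsigned long parent_h_load,
				     unsigned long child_se_weight,
				     unsigned long parent_cfs_rq_weight)
{
	/* the "+ 1" only guards against an empty (zero-weight) parent runqueue */
	return parent_h_load * child_se_weight / (parent_cfs_rq_weight + 1);
}

/*
 * Example with invented numbers: root h_load 3072, group entity weight 1024,
 * parent cfs_rq weight 3072 gives 3072 * 1024 / 3073 == 1023, i.e. roughly a
 * third of this CPU's load is attributed to that group.
 */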