From d01d4827858cdc2e1c437c87ab65ec0a00fd40f8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 21 Sep 2009 11:06:27 +0200
Subject: sched: Always show Cpus_allowed field in /proc/<pid>/status

The Cpus_allowed fields in /proc/<pid>/status is currently only
shown in case of CONFIG_CPUSETS. However their contents are also
useful for the !CONFIG_CPUSETS case.

So change the current behaviour and always show these fields.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090921090627.GD4649@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650..762aea9 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -321,6 +321,16 @@ static inline void task_context_switch_counts(struct seq_file *m,
 			p->nivcsw);
 }
 
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Cpus_allowed:\t");
+	seq_cpumask(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+	seq_printf(m, "Cpus_allowed_list:\t");
+	seq_cpumask_list(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -335,6 +345,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	}
 	task_sig(m, task);
 	task_cap(m, task);
+	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 #if defined(CONFIG_S390)
 	task_show_regs(m, task);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41..b81f7f0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2499,15 +2499,9 @@ const struct file_operations proc_cpuset_operations = {
 };
 #endif /* CONFIG_PROC_PID_CPUSET */
 
-/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+/* Display task mems_allowed in /proc/<pid>/status file. */
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
-	seq_printf(m, "Cpus_allowed:\t");
-	seq_cpumask(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
-	seq_printf(m, "Cpus_allowed_list:\t");
-	seq_cpumask_list(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
 	seq_printf(m, "Mems_allowed:\t");
 	seq_nodemask(m, &task->mems_allowed);
 	seq_printf(m, "\n");
-- 
cgit v0.10.2


From cf82ff7ea7695b0e82ba07bc5e9f1bd03a74e1aa Mon Sep 17 00:00:00 2001
From: "Jayson R. King" <dev@jaysonking.com>
Date: Mon, 5 Oct 2009 05:21:26 -0500
Subject: sched: Remove obsolete comment in sched_init()

Remove the comment about calling alloc_bootmem() as it is not
called here since commit 36b7b6d465489c4754c4fd66fcec6086eba87896.

Signed-off-by: Jayson R. King <dev@jaysonking.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Kosina <trivial@kernel.org>
LKML-Reference: <4AC9C8A6.6010209@jaysonking.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 830967e..a56446d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9322,10 +9322,6 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
-- 
cgit v0.10.2


From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001
From: Ryota Ozaki <ozaki.ryota@gmail.com>
Date: Sat, 24 Oct 2009 01:20:10 +0900
Subject: sched, cpuacct: Fix niced guest time accounting

CPU time of a guest is always accounted in 'user' time
without concern for the nice value of its counterpart
process although the guest is scheduled under the nice
value.

This patch fixes the defect and accounts cpu time of
a niced guest in 'nice' time as same as a niced process.

And also the patch adds 'guest_nice' to cpuacct. The
value provides niced guest cpu time which is like 'nice'
to 'user'.

The original discussions can be found here:

  http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html
  http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html

Signed-off-by: Ryota Ozaki <ozaki.ryota@gmail.com>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2c48f94..4af0018 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1072,7 +1072,8 @@ second).  The meanings of the columns are as follows, from left to right:
 - irq: servicing interrupts
 - softirq: servicing softirqs
 - steal: involuntary wait
-- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
 
 The "intr" line gives counts of interrupts  serviced since boot time, for each
 of the  possible system interrupts.   The first  column  is the  total of  all
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c..b9b7aad 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
+	cputime64_t guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
+	guest = guest_nice = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+		guest_nice = cputime64_add(guest_nice,
+			kstat_cpu(i).cpustat.guest_nice);
 		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
 		}
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+		"%llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
+		(unsigned long long)cputime64_to_clock_t(guest),
+		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
+		guest_nice = kstat_cpu(i).cpustat.guest_nice;
 		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+			"%llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
+			(unsigned long long)cputime64_to_clock_t(guest),
+			(unsigned long long)cputime64_to_clock_t(guest_nice));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 348fa88..c059044 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ struct cpu_usage_stat {
 	cputime64_t iowait;
 	cputime64_t steal;
 	cputime64_t guest;
+	cputime64_t guest_nice;
 };
 
 struct kernel_stat {
diff --git a/kernel/sched.c b/kernel/sched.c
index e520581..67be4d0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5017,8 +5017,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	/* Add guest time to cpustat. */
-	cpustat->user = cputime64_add(cpustat->user, tmp);
-	cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	if (TASK_NICE(p) > 0) {
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+		cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+	} else {
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+		cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	}
 }
 
 /*
-- 
cgit v0.10.2


From 1477b6a7edd9ffa7bba4f9779ce9a76ce92761ed Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:14:16 +0900
Subject: sched: Remove unused __schedule() declaration

__schedule() had been removed.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF129C8.3030008@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60..f18102c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -349,7 +349,6 @@ extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
-asmlinkage void __schedule(void);
 asmlinkage void schedule(void);
 extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
 
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a31..7d70146 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 
 	/*
 	 * All threads that don't have debuggerinfo should be
-	 * in __schedule() sleeping, since all other CPUs
+	 * in schedule() sleeping, since all other CPUs
 	 * are in kgdb_wait, and thus have debuggerinfo.
 	 */
 	if (local_debuggerinfo) {
-- 
cgit v0.10.2


From 2a2bb3142d326bb28b03875cabfc49baaac9a14a Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:16:10 +0900
Subject: sched: Remove unused time_sync_thresh declaration

time_sync_thresh had been removed.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF12A3A.5050200@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f18102c..754b3de 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -171,8 +171,6 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 }
 #endif
 
-extern unsigned long long time_sync_thresh;
-
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
-- 
cgit v0.10.2


From 9824a2b728b63e7ff586b9fd9293c819be79f0f3 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:16:54 +0900
Subject: sched: Remove unused cpu_nr_migrations()

cpu_nr_migrations() is not used, remove it.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF12A66.6020609@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 754b3de..dfc21fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -145,7 +145,6 @@ extern unsigned long this_cpu_load(void);
 
 
 extern void calc_global_load(void);
-extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 67be4d0..30fd0ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -541,7 +541,6 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
-	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -2049,7 +2048,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
-		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2989,15 +2987,6 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-	return cpu_rq(cpu)->nr_migrations_in;
-}
-
-/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
-- 
cgit v0.10.2


From 45a5c8bad827ebb9c9798becc15bce2e804d49e0 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Wed, 4 Nov 2009 03:15:44 +0530
Subject: sched: Add USER_SCHED to feature removal list

Peter Zijlstra suggested that we remove USER_SCHED at:

   http://lkml.org/lkml/2009/3/21/67

Removing USER_SCHED removes a lot of code from the scheduler
and simplifies the code.

We already have the ability to do user based classification
which is tightened using PAM in userspace.

Schedule USER_SCHED for removal in 2.6.34

Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
LKML-Reference: <20091103214544.GI5495@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index bc693ff..f613df8 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,21 @@ be removed from this file.
 
 ---------------------------
 
+What:	USER_SCHED
+When:	2.6.34
+
+Why:	USER_SCHED was implemented as a proof of concept for group scheduling.
+	The effect of USER_SCHED can already be achieved from userspace with
+	the help of libcgroup. The removal of USER_SCHED will also simplify
+	the scheduler code with the removal of one major ifdef. There are also
+	issues USER_SCHED has with USER_NS. A decision was taken not to fix
+	those and instead remove USER_SCHED. Also new group scheduling
+	features will not be implemented for USER_SCHED.
+
+Who:	Dhaval Giani <dhaval@linux.vnet.ibm.com>
+
+---------------------------
+
 What:	PRISM54
 When:	2.6.34
 
-- 
cgit v0.10.2


From e2c880630438f80b474378d5487b511b07665051 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 3 Nov 2009 14:53:15 +1030
Subject: cpumask: Simplify sched_rt.c

find_lowest_rq() wants to call pick_optimal_cpu() on the
intersection of sched_domain_span(sd) and lowest_mask.  Rather
than doing a cpus_and into a temporary, we can open-code it.

This actually makes the code slightly clearer, IMHO.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <200911031453.15350.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790c..5c5fef3 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu,
-				   const struct cpumask *mask)
-{
-	int first;
-
-	/* "this_cpu" is cheaper to preempt than a remote processor */
-	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
-		return this_cpu;
-
-	first = cpumask_first(mask);
-	if (first < nr_cpu_ids)
-		return first;
-
-	return -1;
-}
-
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
 	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
-	cpumask_var_t domain_mask;
 
 	if (task->rt.nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
 	 * Otherwise, we consult the sched_domains span maps to figure
 	 * out which cpu is logically closest to our hot cache data.
 	 */
-	if (this_cpu == cpu)
-		this_cpu = -1; /* Skip this_cpu opt if the same */
-
-	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
-		for_each_domain(cpu, sd) {
-			if (sd->flags & SD_WAKE_AFFINE) {
-				int best_cpu;
+	if (!cpumask_test_cpu(this_cpu, lowest_mask))
+		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
 
-				cpumask_and(domain_mask,
-					    sched_domain_span(sd),
-					    lowest_mask);
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_AFFINE) {
+			int best_cpu;
 
-				best_cpu = pick_optimal_cpu(this_cpu,
-							    domain_mask);
-
-				if (best_cpu != -1) {
-					free_cpumask_var(domain_mask);
-					return best_cpu;
-				}
-			}
+			/*
+			 * "this_cpu" is cheaper to preempt than a
+			 * remote processor.
+			 */
+			if (this_cpu != -1 &&
+			    cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+				return this_cpu;
+
+			best_cpu = cpumask_first_and(lowest_mask,
+						     sched_domain_span(sd));
+			if (best_cpu < nr_cpu_ids)
+				return best_cpu;
 		}
-		free_cpumask_var(domain_mask);
 	}
 
 	/*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
 	 * just give the caller *something* to work with from the compatible
 	 * locations.
 	 */
-	return pick_optimal_cpu(this_cpu, lowest_mask);
+	if (this_cpu != -1)
+		return this_cpu;
+
+	cpu = cpumask_any(lowest_mask);
+	if (cpu < nr_cpu_ids)
+		return cpu;
+	return -1;
 }
 
 /* Will lock the rq it finds */
-- 
cgit v0.10.2


From acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 3 Nov 2009 14:53:40 +1030
Subject: cpumask: Partition_sched_domains takes array of cpumask_var_t

Currently partition_sched_domains() takes a 'struct cpumask
*doms_new' which is a kmalloc'ed array of cpumask_t.  You can't
have such an array if 'struct cpumask' is undefined, as we plan
for CONFIG_CPUMASK_OFFSTACK=y.

So, we make this an array of cpumask_var_t instead: this is the
same for the CONFIG_CPUMASK_OFFSTACK=n case, but requires
multiple allocations for the CONFIG_CPUMASK_OFFSTACK=y case.
Hence we add alloc_sched_domains() and free_sched_domains()
functions.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <200911031453.40668.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfc21fb..78ba664 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1009,9 +1009,13 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
-extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
 
+/* Allocate an array of sched domains, for partition_sched_domains(). */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+
 /* Test a flag in parent sched domain */
 static inline int test_sd_parent(struct sched_domain *sd, int flag)
 {
@@ -1029,7 +1033,7 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			struct sched_domain_attr *dattr_new)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d247381..3cf2183 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  *	element of the partition (one sched domain) to be passed to
  *	partition_sched_domains().
  */
-/* FIXME: see the FIXME in partition_sched_domains() */
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
 			struct sched_domain_attr **attributes)
 {
 	LIST_HEAD(q);		/* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
-	struct cpumask *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		doms = kmalloc(cpumask_size(), GFP_KERNEL);
+		ndoms = 1;
+		doms = alloc_sched_domains(ndoms);
 		if (!doms)
 			goto done;
 
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms, top_cpuset.cpus_allowed);
+		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
 
-		ndoms = 1;
 		goto done;
 	}
 
@@ -636,7 +635,7 @@ restart:
 	 * Now we know how many domains to create.
 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 	 */
-	doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
+	doms = alloc_sched_domains(ndoms);
 	if (!doms)
 		goto done;
 
@@ -656,7 +655,7 @@ restart:
 			continue;
 		}
 
-		dp = doms + nslot;
+		dp = doms[nslot];
 
 		if (nslot == ndoms) {
 			static int warnings = 10;
@@ -718,7 +717,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	get_online_cpus();
@@ -2052,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	switch (phase) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 30fd0ba..ae026aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8846,7 +8846,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
 	return __build_sched_domains(cpu_map, NULL);
 }
 
-static struct cpumask *doms_cur;	/* current sched domains */
+static cpumask_var_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attribues of custom domains in 'doms_cur' */
@@ -8868,6 +8868,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
 	return 0;
 }
 
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -8879,12 +8904,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
 
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
-	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
+	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
-		doms_cur = fallback_doms;
-	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
 	dattr_cur = NULL;
-	err = build_sched_domains(doms_cur);
+	err = build_sched_domains(doms_cur[0]);
 	register_sched_domain_sysctl();
 
 	return err;
@@ -8934,19 +8959,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
- * The passed in 'doms_new' should be kmalloc'd. This routine takes
- * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL &&
- * ndoms_new == 1, and partition_sched_domains() will fallback to
- * the single partition 'fallback_doms', it also forces the domains
- * to be rebuilt.
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8954,8 +8979,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * Call with hotplug lock held
  */
-/* FIXME: Change to struct cpumask *doms_new[] */
-void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
@@ -8974,40 +8998,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(&doms_cur[i], &doms_new[j])
+			if (cpumask_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
-		detach_destroy_domains(doms_cur + i);
+		detach_destroy_domains(doms_cur[i]);
 match1:
 		;
 	}
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		doms_new = fallback_doms;
-		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur && !new_topology; j++) {
-			if (cpumask_equal(&doms_new[i], &doms_cur[j])
+			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
-		__build_sched_domains(doms_new + i,
+		__build_sched_domains(doms_new[i],
 					dattr_new ? dattr_new + i : NULL);
 match2:
 		;
 	}
 
 	/* Remember the new sched domains */
-	if (doms_cur != fallback_doms)
-		kfree(doms_cur);
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
 	dattr_cur = dattr_new;
-- 
cgit v0.10.2


From a1f84a3ab8e002159498814eaa7e48c33752b04b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 27 Oct 2009 15:35:38 +0100
Subject: sched: Check for an idle shared cache in select_task_rq_fair()

When waking affine, check for an idle shared cache, and if
found, wake to that CPU/sibling instead of the waker's CPU.

This improves pgsql+oltp ramp up by roughly 8%. Possibly more
for other loads, depending on overlap. The trade-off is a
roughly 1% peak downturn if tasks are truly synchronous.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@kernel.org>
LKML-Reference: <1256654138.17752.7.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b4..da87385 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1372,11 +1372,36 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 				want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
+			int candidate = -1, i;
 
-			affine_sd = tmp;
-			want_affine = 0;
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				candidate = cpu;
+
+			/*
+			 * Check for an idle shared cache.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING) {
+				if (candidate == cpu) {
+					if (!cpu_rq(prev_cpu)->cfs.nr_running)
+						candidate = prev_cpu;
+				}
+
+				if (candidate == -1 || candidate == cpu) {
+					for_each_cpu(i, sched_domain_span(tmp)) {
+						if (!cpu_rq(i)->cfs.nr_running) {
+							candidate = i;
+							break;
+						}
+					}
+				}
+			}
+
+			if (candidate >= 0) {
+				affine_sd = tmp;
+				want_affine = 0;
+				cpu = candidate;
+			}
 		}
 
 		if (!want_sd && !want_affine)
-- 
cgit v0.10.2


From 1b9508f6831e10d53256825de8904caa22d1ca2c Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 4 Nov 2009 17:53:50 +0100
Subject: sched: Rate-limit newidle

Rate limit newidle to migration_cost. It's a win for all
stages of sysbench oltp tests.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index ae026aa..f849212 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -589,6 +589,8 @@ struct rq {
 
 	u64 rt_avg;
 	u64 age_stamp;
+	u64 idle_stamp;
+	u64 avg_idle;
 #endif
 
 	/* calc_load related fields */
@@ -2353,6 +2355,17 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	if (rq != orig_rq)
 		update_rq_clock(rq);
 
+	if (rq->idle_stamp) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
+
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -4389,6 +4402,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 
+	this_rq->idle_stamp = this_rq->clock;
+
+	if (this_rq->avg_idle < sysctl_sched_migration_cost)
+		return;
+
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 
@@ -4403,8 +4421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task)
+		if (pulled_task) {
+			this_rq->idle_stamp = 0;
 			break;
+		}
 	}
 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
 		/*
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb8440..6988cf0 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
 
 #ifdef CONFIG_SCHEDSTATS
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
 	P(yld_count);
 
 	P(sched_switch);
 	P(sched_count);
 	P(sched_goidle);
+#ifdef CONFIG_SMP
+	P64(avg_idle);
+#endif
 
 	P(ttwu_count);
 	P(ttwu_local);
-- 
cgit v0.10.2


From fd210738f6601d0fb462df9a2fe5a41896ff6a8f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 5 Nov 2009 10:57:46 +0100
Subject: sched: Fix affinity logic in select_task_rq_fair()

Ingo Molnar reported:

[   26.804000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10
[   26.808000] caller is vmstat_update+0x26/0x70
[   26.812000] Pid: 10, comm: events/1 Not tainted 2.6.32-rc5 #6887
[   26.816000] Call Trace:
[   26.820000]  [<c1924a24>] ? printk+0x28/0x3c
[   26.824000]  [<c13258a0>] debug_smp_processor_id+0xf0/0x110
[   26.824000] mount used greatest stack depth: 1464 bytes left
[   26.828000]  [<c111d086>] vmstat_update+0x26/0x70
[   26.832000]  [<c1086418>] worker_thread+0x188/0x310
[   26.836000]  [<c10863b7>] ? worker_thread+0x127/0x310
[   26.840000]  [<c108d310>] ? autoremove_wake_function+0x0/0x60
[   26.844000]  [<c1086290>] ? worker_thread+0x0/0x310
[   26.848000]  [<c108cf0c>] kthread+0x7c/0x90
[   26.852000]  [<c108ce90>] ? kthread+0x0/0x90
[   26.856000]  [<c100c0a7>] kernel_thread_helper+0x7/0x10
[   26.860000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10
[   26.864000] caller is vmstat_update+0x3c/0x70

Because this commit:

  a1f84a3: sched: Check for an idle shared cache in select_task_rq_fair()

broke ->cpus_allowed.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: arjan@infradead.org
Cc: <stable@kernel.org>
LKML-Reference: <1257415066.12867.1.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da87385..e4d4483 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1389,6 +1389,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 
 				if (candidate == -1 || candidate == cpu) {
 					for_each_cpu(i, sched_domain_span(tmp)) {
+						if (!cpumask_test_cpu(i, &p->cpus_allowed))
+							continue;
 						if (!cpu_rq(i)->cfs.nr_running) {
 							candidate = i;
 							break;
-- 
cgit v0.10.2


From d8c80ce091f6ead6710bc71b58f2c32e5bf855e4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 27 Oct 2009 15:45:23 +0800
Subject: sched, no_hz: Remove unused rq->last_tick_seen field

In 15934a37324f32e0fda633dc7984a671ea81cd75,
field last_tick_seen is added to struct rq.
But it is unused now.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Guillaume Chazarain <guichaz@yahoo.fr>
LKML-Reference: <4AE6A513.6010100@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index f849212..23e3535 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -534,7 +534,6 @@ struct rq {
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
-	unsigned long last_tick_seen;
 	unsigned char in_nohz_recently;
 #endif
 	/* capture load from *all* tasks on this cpu: */
-- 
cgit v0.10.2


From eae0c9dfb534cb3449888b9601228efa6480fdb5 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 10 Nov 2009 03:50:02 +0100
Subject: sched: Fix and clean up rate-limit newidle code

Commit 1b9508f, "Rate-limit newidle" has been confirmed to fix
the netperf UDP loopback regression reported by Alex Shi.

This is a cleanup and a fix:

 - moved to a more out of the way spot

 - fix to ensure that balancing doesn't try to balance
   runqueues which haven't gone online yet, which can
   mess up CPU enumeration during boot.

Reported-by: Alex Shi <alex.shi@intel.com>
Reported-by: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org> # .32.x: a1f84a3: sched: Check for an idle shared cache
Cc: <stable@kernel.org> # .32.x: 1b9508f: sched: Rate-limit newidle
Cc: <stable@kernel.org> # .32.x: fd21073: sched: Fix affinity logic
Cc: <stable@kernel.org> # .32.x
LKML-Reference: <1257821402.5648.17.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 23e3535..ad37776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2354,17 +2354,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	if (rq != orig_rq)
 		update_rq_clock(rq);
 
-	if (rq->idle_stamp) {
-		u64 delta = rq->clock - rq->idle_stamp;
-		u64 max = 2*sysctl_sched_migration_cost;
-
-		if (delta > max)
-			rq->avg_idle = max;
-		else
-			update_avg(&rq->avg_idle, delta);
-		rq->idle_stamp = 0;
-	}
-
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -2421,6 +2410,17 @@ out_running:
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
+
+	if (unlikely(rq->idle_stamp)) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
 #endif
 out:
 	task_rq_unlock(rq, &flags);
@@ -4098,7 +4098,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4261,7 +4261,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int all_pinned = 0;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -9522,6 +9522,8 @@ void __init sched_init(void)
 		rq->cpu = i;
 		rq->online = 0;
 		rq->migration_thread = NULL;
+		rq->idle_stamp = 0;
+		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
 #endif
-- 
cgit v0.10.2


From ffd44db5f02af32bcc25a8eb5981bf02a141cdab Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 10 Nov 2009 20:12:01 +0100
Subject: sched: Make sure task has correct sched_class after policy change

From the code in rt_mutex_setprio(), it is evident that the
intention is that task's with a RT 'prio' value as a consequence
of receiving a PI boost also have their 'sched_class' field set
to '&rt_sched_class'.

However, Peter noticed that the code in __setscheduler() could
result in this intention being frustrated. Fix it.

Reported-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1257880321.4108.457.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index ad37776..43e61fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6159,22 +6159,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	BUG_ON(p->se.on_rq);
 
 	p->policy = policy;
-	switch (p->policy) {
-	case SCHED_NORMAL:
-	case SCHED_BATCH:
-	case SCHED_IDLE:
-		p->sched_class = &fair_sched_class;
-		break;
-	case SCHED_FIFO:
-	case SCHED_RR:
-		p->sched_class = &rt_sched_class;
-		break;
-	}
-
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
 	set_load_weight(p);
 }
 
-- 
cgit v0.10.2


From 055a00865dcfc8e61f3cbefbb879c9577bd36ae5 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 12 Nov 2009 11:07:44 +0100
Subject: sched: Fix/add missing update_rq_clock() calls

kthread_bind(), migrate_task() and sched_fork were missing
updates, and try_to_wake_up() was updating after having already
used the stale clock.

Aside from preventing potential latency hits, there' a side
benefit in that early boot printk time stamps become monotonic.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1258020464.6491.2.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>

diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0..701eca4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2017,6 +2017,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
 	}
 
 	spin_lock_irqsave(&rq->lock, flags);
+	update_rq_clock(rq);
 	set_task_cpu(p, cpu);
 	p->cpus_allowed = cpumask_of_cpu(cpu);
 	p->rt.nr_cpus_allowed = 1;
@@ -2115,6 +2116,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	 * it is sufficient to simply update the task's cpu field.
 	 */
 	if (!p->se.on_rq && !task_running(rq, p)) {
+		update_rq_clock(rq);
 		set_task_cpu(p, dest_cpu);
 		return 0;
 	}
@@ -2376,14 +2378,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	task_rq_unlock(rq, &flags);
 
 	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
+	if (cpu != orig_cpu) {
+		local_irq_save(flags);
+		rq = cpu_rq(cpu);
+		update_rq_clock(rq);
 		set_task_cpu(p, cpu);
-
+		local_irq_restore(flags);
+	}
 	rq = task_rq_lock(p, &flags);
 
-	if (rq != orig_rq)
-		update_rq_clock(rq);
-
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -2545,6 +2548,7 @@ static void __sched_fork(struct task_struct *p)
 void sched_fork(struct task_struct *p, int clone_flags)
 {
 	int cpu = get_cpu();
+	unsigned long flags;
 
 	__sched_fork(p);
 
@@ -2581,7 +2585,10 @@ void sched_fork(struct task_struct *p, int clone_flags)
 #ifdef CONFIG_SMP
 	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
 #endif
+	local_irq_save(flags);
+	update_rq_clock(cpu_rq(cpu));
 	set_task_cpu(p, cpu);
+	local_irq_restore(flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
-- 
cgit v0.10.2


From 761b1d26df542fd5eb348837351e4d2f3bc7bffe Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 12 Nov 2009 13:33:45 +0900
Subject: sched: Fix granularity of task_u/stime()

Originally task_s/utime() were designed to return clock_t but
later changed to return cputime_t by following commit:

  commit efe567fc8281661524ffa75477a7c4ca9b466c63
  Author: Christian Borntraeger <borntraeger@de.ibm.com>
  Date:   Thu Aug 23 15:18:02 2007 +0200

It only changed the type of return value, but not the
implementation. As the result the granularity of task_s/utime()
is still that of clock_t, not that of cputime_t.

So using task_s/utime() in __exit_signal() makes values
accumulated to the signal struct to be rounded and coarse
grained.

This patch removes casts to clock_t in task_u/stime(), to keep
granularity of cputime_t over the calculation.

v2:
  Use div_u64() to avoid error "undefined reference to `__udivdi3`"
  on some 32bit systems.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: xiyou.wangcong@gmail.com
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4AFB9029.9000208@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 43e61fa..ab9a034 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5156,41 +5156,45 @@ cputime_t task_stime(struct task_struct *p)
 	return p->stime;
 }
 #else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) \
+	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+#endif
+
 cputime_t task_utime(struct task_struct *p)
 {
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
+	cputime_t utime = p->utime, total = utime + p->stime;
 	u64 temp;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
 		temp *= utime;
 		do_div(temp, total);
 	}
-	utime = (clock_t)temp;
+	utime = (cputime_t)temp;
 
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	p->prev_utime = max(p->prev_utime, utime);
 	return p->prev_utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
-	clock_t stime;
+	cputime_t stime;
 
 	/*
 	 * Use CFS's precise accounting. (we subtract utime from
 	 * the total, to make sure the total observed by userspace
 	 * grows monotonically - apps rely on that):
 	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
+	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
 
 	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+		p->prev_stime = max(p->prev_stime, stime);
 
 	return p->prev_stime;
 }
-- 
cgit v0.10.2


From a50bde5130f65733142b32975616427d0ea50856 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 12 Nov 2009 15:55:28 +0100
Subject: sched: Cleanup select_task_rq_fair()

Clean up the new affine to idle sibling bits while trying to
grok them. Should not have any function differences.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20091112145610.832503781@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e4d4483..a32df15 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1319,6 +1319,41 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu) {
+		if (!cpu_rq(prev_cpu)->cfs.nr_running)
+			target = prev_cpu;
+	}
+
+	/*
+	 * Otherwise, iterate the domain and find an elegible idle cpu.
+	 */
+	if (target == -1 || target == cpu) {
+		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+			if (!cpu_rq(i)->cfs.nr_running) {
+				target = i;
+				break;
+			}
+		}
+	}
+
+	return target;
+}
+
+/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
@@ -1373,36 +1408,30 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		}
 
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
-			int candidate = -1, i;
+			int target = -1;
 
+			/*
+			 * If both cpu and prev_cpu are part of this domain,
+			 * cpu is a valid SD_WAKE_AFFINE target.
+			 */
 			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
-				candidate = cpu;
+				target = cpu;
 
 			/*
-			 * Check for an idle shared cache.
+			 * If there's an idle sibling in this domain, make that
+			 * the wake_affine target instead of the current cpu.
+			 *
+			 * XXX: should we possibly do this outside of
+			 * WAKE_AFFINE, in case the shared cache domain is
+			 * smaller than the WAKE_AFFINE domain?
 			 */
-			if (tmp->flags & SD_PREFER_SIBLING) {
-				if (candidate == cpu) {
-					if (!cpu_rq(prev_cpu)->cfs.nr_running)
-						candidate = prev_cpu;
-				}
-
-				if (candidate == -1 || candidate == cpu) {
-					for_each_cpu(i, sched_domain_span(tmp)) {
-						if (!cpumask_test_cpu(i, &p->cpus_allowed))
-							continue;
-						if (!cpu_rq(i)->cfs.nr_running) {
-							candidate = i;
-							break;
-						}
-					}
-				}
-			}
+			if (tmp->flags & SD_PREFER_SIBLING)
+				target = select_idle_sibling(p, tmp, target);
 
-			if (candidate >= 0) {
+			if (target >= 0) {
 				affine_sd = tmp;
 				want_affine = 0;
-				cpu = candidate;
+				cpu = target;
 			}
 		}
 
-- 
cgit v0.10.2


From fe3bcfe1f6c1fc4ea7706ac2d05e579fd9092682 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 12 Nov 2009 15:55:29 +0100
Subject: sched: More generic WAKE_AFFINE vs select_idle_sibling()

Instead of only considering SD_WAKE_AFFINE | SD_PREFER_SIBLING
domains also allow all SD_PREFER_SIBLING domains below a
SD_WAKE_AFFINE domain to change the affinity target.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20091112145610.909723612@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a32df15..f28a267 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1333,20 +1333,16 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
 	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
 	 * always a better target than the current cpu.
 	 */
-	if (target == cpu) {
-		if (!cpu_rq(prev_cpu)->cfs.nr_running)
-			target = prev_cpu;
-	}
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
 
 	/*
 	 * Otherwise, iterate the domain and find an elegible idle cpu.
 	 */
-	if (target == -1 || target == cpu) {
-		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
-			if (!cpu_rq(i)->cfs.nr_running) {
-				target = i;
-				break;
-			}
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
 		}
 	}
 
@@ -1407,7 +1403,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 				want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
+		/*
+		 * While iterating the domains looking for a spanning
+		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+		 * in cache sharing domains along the way.
+		 */
+		if (want_affine) {
 			int target = -1;
 
 			/*
@@ -1420,17 +1421,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			/*
 			 * If there's an idle sibling in this domain, make that
 			 * the wake_affine target instead of the current cpu.
-			 *
-			 * XXX: should we possibly do this outside of
-			 * WAKE_AFFINE, in case the shared cache domain is
-			 * smaller than the WAKE_AFFINE domain?
 			 */
 			if (tmp->flags & SD_PREFER_SIBLING)
 				target = select_idle_sibling(p, tmp, target);
 
 			if (target >= 0) {
-				affine_sd = tmp;
-				want_affine = 0;
+				if (tmp->flags & SD_WAKE_AFFINE) {
+					affine_sd = tmp;
+					want_affine = 0;
+				}
 				cpu = target;
 			}
 		}
-- 
cgit v0.10.2


From 498657a478c60be092208422fefa9c7b248729c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Nov 2009 18:33:53 +0900
Subject: sched, kvm: Fix race condition involving sched_in_preempt_notifers

In finish_task_switch(), fire_sched_in_preempt_notifiers() is
called after finish_lock_switch().

However, depending on architecture, preemption can be enabled after
finish_lock_switch() which breaks the semantics of preempt
notifiers.

So move it before finish_arch_switch(). This also makes the in-
notifiers symmetric to out- notifiers in terms of locking - now
both are called under rq lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AFD2801.7020900@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 701eca4..cea2bea 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2758,9 +2758,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(current, cpu_of(rq));
+	fire_sched_in_preempt_notifiers(current);
 	finish_lock_switch(rq, prev);
 
-	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
-- 
cgit v0.10.2


From 047106adcc85e3023da210143a6ab8a55df9e0fc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 16 Nov 2009 10:28:09 +0100
Subject: sched: Sched_rt_periodic_timer vs cpu hotplug

Heiko reported a case where a timer interrupt managed to
reference a root_domain structure that was already freed by a
concurrent hot-un-plug operation.

Solve this like the regular sched_domain stuff is also
synchronized, by adding a synchronize_sched() stmt to the free
path, this ensures that a root_domain stays present for any
atomic section that could have observed it.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: Siddha Suresh B <suresh.b.siddha@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
LKML-Reference: <1258363873.26714.83.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index cea2bea..3c91f11 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7912,6 +7912,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
 static void free_rootdomain(struct root_domain *rd)
 {
+	synchronize_sched();
+
 	cpupri_cleanup(&rd->cpupri);
 
 	free_cpumask_var(rd->rto_mask);
-- 
cgit v0.10.2


From 429947248f814e90f416ab4f68a871ab628000c3 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Fri, 20 Nov 2009 17:40:37 +0100
Subject: sched_feat_write(): Update ppos instead of file->f_pos

sched_feat_write() should update ppos instead of file->f_pos.

(This reduces some BKL dependencies of this code.)

Signed-off-by: Jan Blunck <jblunck@suse.de>
Cc: jkacur@redhat.com
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
LKML-Reference: <1258735245-25826-8-git-send-email-jblunck@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index ab9a034..93474a7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -771,7 +771,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	if (!sched_feat_names[i])
 		return -EINVAL;
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	return cnt;
 }
-- 
cgit v0.10.2


From 36ace27e3e60d44ea69ce394b2e45386ae98d9d9 Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Tue, 24 Nov 2009 11:55:45 +0100
Subject: sched: Optimize branch hint in pick_next_task_fair()

Branch hint profiling on my nehalem machine showed 90%
incorrect branch hints:

  15728471 158903754  90 pick_next_task_fair
  sched_fair.c    1555

Signed-off-by: Tim Blechmann <tim@klingt.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0BBBB1.2050100@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f28a267..24086e7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1704,7 +1704,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
-- 
cgit v0.10.2


From 710390d90f143a9ebb87a475215140f426792efd Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Tue, 24 Nov 2009 11:55:27 +0100
Subject: sched: Optimize branch hint in context_switch()

Branch hint profiling on my nehalem machine showed over 90%
incorrect branch hints:

  10420275 170645395  94 context_switch                 sched.c
   3043
  10408421 171098521  94 context_switch                 sched.c
   3050

Signed-off-by: Tim Blechmann <tim@klingt.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0BBB9F.6080304@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 93474a7..010d5e1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2829,14 +2829,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
-	if (unlikely(!mm)) {
+	if (likely(!mm)) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (unlikely(!prev->mm)) {
+	if (likely(!prev->mm)) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
-- 
cgit v0.10.2


From a3a1de0c34de6f5f8332cd6151c46af7813c0fcb Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Tue, 24 Nov 2009 11:55:15 +0100
Subject: sched, x86: Optimize branch hint in __switch_to()

Branch hint profiling on my nehalem machine showed 96%
incorrect branch hints:

  6548732 174664120  96 __switch_to                    process_64.c
    406
  6548745 174565593  96 __switch_to                    process_64.c
    410

Signed-off-by: Tim Blechmann <tim@klingt.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0BBB93.3080307@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b6..d9db104 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -406,11 +406,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * This won't pick up thread selector changes, but I guess that is ok.
 	 */
 	savesegment(es, prev->es);
-	if (unlikely(next->es | prev->es))
+	if (next->es | prev->es)
 		loadsegment(es, next->es);
-
 	savesegment(ds, prev->ds);
-	if (unlikely(next->ds | prev->ds))
+	if (next->ds | prev->ds)
 		loadsegment(ds, next->ds);
 
 
-- 
cgit v0.10.2


From 93335a21557e80f6a99bc2812c634e488139043c Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Wed, 25 Nov 2009 15:23:41 +0200
Subject: sched.c: Call debug_show_all_locks() when dumping all tasks

In commit v2.6.21-691-g39bc89f ("make SysRq-T show all tasks
again") the interface of show_state_filter() was changed: zero
valued 'state_filter' specifies "dump all tasks" (instead of -1).

However, the condition for calling debug_show_all_locks() ("show
locks if all tasks are dumped") was not updated accordingly.

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Cc: peterz@infradead.org
LKML-Reference: <4b0d2fe4.0ab6660a.6437.3cfc@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 010d5e1..a57c6ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6915,7 +6915,7 @@ void show_state_filter(unsigned long state_filter)
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
-	if (state_filter == -1)
+	if (!state_filter)
 		debug_show_all_locks();
 }
 
-- 
cgit v0.10.2


From f6630114d9198aa959ac95c131334c020038f253 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 17 Nov 2009 18:22:15 -0600
Subject: sched: Limit the number of scheduler debug messages

Remove the verbose scheduler debug messages unless kernel
parameter "sched_debug" set.  /proc/sched_debug unchanged.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091118002221.489305000@alcatraz.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9107b38..f2a9507 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2182,6 +2182,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	sbni=		[NET] Granch SBNI12 leased line adapter
 
+	sched_debug	[KNL] Enables verbose scheduler debug messages.
+
 	sc1200wdt=	[HW,WDT] SC1200 WDT (watchdog) driver
 			Format: <io>[,<timeout>[,<isapnp>]]
 
diff --git a/kernel/sched.c b/kernel/sched.c
index a57c6ae..48ff66a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7720,6 +7720,16 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
+static __read_mostly int sched_domain_debug_enabled;
+
+static int __init sched_domain_debug_setup(char *str)
+{
+	sched_domain_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
@@ -7806,6 +7816,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 	cpumask_var_t groupmask;
 	int level = 0;
 
+	if (!sched_domain_debug_enabled)
+		return;
+
 	if (!sd) {
 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
 		return;
-- 
cgit v0.10.2


From d180c5bccec02612256fd8076ff3c1fac3429553 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:48:30 +0900
Subject: sched: Introduce task_times() to replace task_{u,s}time() pair

Functions task_{u,s}time() are called in pair in almost all
cases.  However task_stime() is implemented to call task_utime()
from its inside, so such paired calls run task_utime() twice.

It means we do heavy divisions (div_u64 + do_div) twice to get
utime and stime which can be obtained at same time by one set
of divisions.

This patch introduces a function task_times(*tsk, *utime,
*stime) to retrieve utime and stime at once in better, optimized
way.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
LKML-Reference: <4B0E16AE.906@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e209f64..330deda 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -535,8 +535,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		utime = task_utime(task);
-		stime = task_stime(task);
+		task_times(task, &utime, &stime);
 		gtime = task_gtime(task);
 	}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78ba664..fe6ae15 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1723,6 +1723,7 @@ static inline void put_task_struct(struct task_struct *t)
 extern cputime_t task_utime(struct task_struct *p);
 extern cputime_t task_stime(struct task_struct *p);
 extern cputime_t task_gtime(struct task_struct *p);
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac..29068ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,6 +91,8 @@ static void __exit_signal(struct task_struct *tsk)
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
 	else {
+		cputime_t utime, stime;
+
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -110,8 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, task_utime(tsk));
-		sig->stime = cputime_add(sig->stime, task_stime(tsk));
+		task_times(tsk, &utime, &stime);
+		sig->utime = cputime_add(sig->utime, utime);
+		sig->stime = cputime_add(sig->stime, stime);
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
diff --git a/kernel/sched.c b/kernel/sched.c
index 315ba40..475a6f2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5191,6 +5191,14 @@ cputime_t task_stime(struct task_struct *p)
 {
 	return p->stime;
 }
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	if (ut)
+		*ut = task_utime(p);
+	if (st)
+		*st = task_stime(p);
+}
 #else
 
 #ifndef nsecs_to_cputime
@@ -5198,41 +5206,48 @@ cputime_t task_stime(struct task_struct *p)
 	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
 #endif
 
-cputime_t task_utime(struct task_struct *p)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	cputime_t utime = p->utime, total = utime + p->stime;
-	u64 temp;
+	cputime_t rtime, utime = p->utime, total = utime + p->stime;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
+	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
-		temp *= utime;
+		u64 temp;
+
+		temp = (u64)(rtime * utime);
 		do_div(temp, total);
-	}
-	utime = (cputime_t)temp;
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
 
+	/*
+	 * Compare with previous values, to keep monotonicity:
+	 */
 	p->prev_utime = max(p->prev_utime, utime);
-	return p->prev_utime;
+	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+	if (ut)
+		*ut = p->prev_utime;
+	if (st)
+		*st = p->prev_stime;
+}
+
+cputime_t task_utime(struct task_struct *p)
+{
+	cputime_t utime;
+	task_times(p, &utime, NULL);
+	return utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
 	cputime_t stime;
-
-	/*
-	 * Use CFS's precise accounting. (we subtract utime from
-	 * the total, to make sure the total observed by userspace
-	 * grows monotonically - apps rely on that):
-	 */
-	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
-
-	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, stime);
-
-	return p->prev_stime;
+	task_times(p, NULL, &stime);
+	return stime;
 }
 #endif
 
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760..bbdfce0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1346,8 +1346,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	utime = stime = cputime_zero;
 
 	if (who == RUSAGE_THREAD) {
-		utime = task_utime(current);
-		stime = task_stime(current);
+		task_times(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
 		maxrss = p->signal->maxrss;
 		goto out;
-- 
cgit v0.10.2


From d5b7c78e975302a1bab28263266c39ecb71acad4 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:49:05 +0900
Subject: sched: Remove task_{u,s,g}time()

Now all task_{u,s}time() pairs are replaced by task_times().
And task_gtime() is too simple to be an inline function.

Cleanup them all.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
LKML-Reference: <4B0E16D1.70902@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 330deda..ca61a88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				gtime = cputime_add(gtime, task_gtime(t));
+				gtime = cputime_add(gtime, t->gtime);
 				t = next_thread(t);
 			} while (t != task);
 
@@ -536,7 +536,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
 		task_times(task, &utime, &stime);
-		gtime = task_gtime(task);
+		gtime = task->gtime;
 	}
 
 	/* scale priority and nice values from timeslices to -20..20 */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6ae15..0395b0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1720,9 +1720,6 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
-extern cputime_t task_utime(struct task_struct *p);
-extern cputime_t task_stime(struct task_struct *p);
-extern cputime_t task_gtime(struct task_struct *p);
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index 29068ab..2eaf68b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,7 +115,7 @@ static void __exit_signal(struct task_struct *tsk)
 		task_times(tsk, &utime, &stime);
 		sig->utime = cputime_add(sig->utime, utime);
 		sig->stime = cputime_add(sig->stime, stime);
-		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
diff --git a/kernel/sched.c b/kernel/sched.c
index 475a6f2..82251c2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5182,22 +5182,12 @@ void account_idle_ticks(unsigned long ticks)
  * Use precise platform statistics if available:
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-cputime_t task_utime(struct task_struct *p)
-{
-	return p->utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
-	return p->stime;
-}
-
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	if (ut)
-		*ut = task_utime(p);
+		*ut = p->utime;
 	if (st)
-		*st = task_stime(p);
+		*st = p->stime;
 }
 #else
 
@@ -5235,27 +5225,8 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	if (st)
 		*st = p->prev_stime;
 }
-
-cputime_t task_utime(struct task_struct *p)
-{
-	cputime_t utime;
-	task_times(p, &utime, NULL);
-	return utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
-	cputime_t stime;
-	task_times(p, NULL, &stime);
-	return stime;
-}
 #endif
 
-inline cputime_t task_gtime(struct task_struct *p)
-{
-	return p->gtime;
-}
-
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
-- 
cgit v0.10.2


From b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:49:27 +0900
Subject: sched, time: Define nsecs_to_jiffies()

Use of msecs_to_jiffies() for nsecs_to_cputime() have some
problems:

 - The type of msecs_to_jiffies()'s argument is unsigned int, so
   it cannot convert msecs greater than UINT_MAX = about 49.7 days.

 - msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument
   is set, assuming that input was negative value.  So it cannot
   convert msecs greater than INT_MAX = about 24.8 days too.

This patch defines a new function nsecs_to_jiffies() that can
deal greater values, and that can deal all incoming values as
unsigned.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Amrico Wang <xiyou.wangcong@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
LKML-Reference: <4B0E16E7.5070307@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 1a9cf78bf..6811f4b 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
 extern unsigned long clock_t_to_jiffies(unsigned long x);
 extern u64 jiffies_64_to_clock_t(u64 x);
 extern u64 nsec_to_clock_t(u64 x);
+extern unsigned long nsecs_to_jiffies(u64 n);
 
 #define TIMESTAMP_SIZE	30
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 82251c2..b3d4e2b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5192,8 +5192,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 #else
 
 #ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) \
-	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469..8047980 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
 #endif
 }
 
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n:	nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+	/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+	return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+	/* overflow after 292 years if HZ = 1024 */
+	return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+	/*
+	 * Generic case - optimized for cases where HZ is a multiple of 3.
+	 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+	 */
+	return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v0.10.2


From 8592e6486a177a02f048567cb928bc3a1f9a86c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Dec 2009 12:56:46 +0900
Subject: sched: Revert 498657a478c60be092208422fefa9c7b248729c2

498657a478c60be092208422fefa9c7b248729c2 incorrectly assumed
that preempt wasn't disabled around context_switch() and thus
was fixing imaginary problem.  It also broke KVM because it
depended on ->sched_in() to be called with irq enabled so that
it can do smp calls from there.

Revert the incorrect commit and add comment describing different
contexts under with the two callbacks are invoked.

Avi: spotted transposed in/out in the added comment.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: peterz@infradead.org
Cc: efault@gmx.de
Cc: rusty@rustcorp.com.au
LKML-Reference: <1259726212-30259-2-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 72b1a10..2e681d9 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -105,6 +105,11 @@ struct preempt_notifier;
  * @sched_out: we've just been preempted
  *    notifier: struct preempt_notifier for the task being preempted
  *    next: the task that's kicking us out
+ *
+ * Please note that sched_in and out are called under different
+ * contexts.  sched_out is called with rq lock held and irq disabled
+ * while sched_in is called without rq lock and irq enabled.  This
+ * difference is intentional and depended upon by its users.
  */
 struct preempt_ops {
 	void (*sched_in)(struct preempt_notifier *notifier, int cpu);
diff --git a/kernel/sched.c b/kernel/sched.c
index b3d4e2b..1031cae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2768,9 +2768,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(current, cpu_of(rq));
-	fire_sched_in_preempt_notifiers(current);
 	finish_lock_switch(rq, prev);
 
+	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
-- 
cgit v0.10.2


From bdddd2963c0264c56f18043f6fa829d3c1d3d1c0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 2 Dec 2009 14:09:16 +1030
Subject: sched: Fix isolcpus boot option

Anton Blanchard wrote:

> We allocate and zero cpu_isolated_map after the isolcpus
> __setup option has run. This means cpu_isolated_map always
> ends up empty and if CPUMASK_OFFSTACK is enabled we write to a
> cpumask that hasn't been allocated.

I introduced this regression in 49557e620339cb13 (sched: Fix
boot crash by zalloc()ing most of the cpu masks).

Use the bootmem allocator if they set isolcpus=, otherwise
allocate and zero like normal.

Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: peterz@infradead.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
LKML-Reference: <200912021409.17013.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Anton Blanchard <anton@samba.org>

diff --git a/kernel/sched.c b/kernel/sched.c
index 1031cae..4883fee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8061,6 +8061,7 @@ static cpumask_var_t cpu_isolated_map;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 	cpulist_parse(str, cpu_isolated_map);
 	return 1;
 }
@@ -9609,7 +9610,9 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
 	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+	/* May be allocated at isolcpus cmdline parse time */
+	if (cpu_isolated_map == NULL)
+		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	perf_event_init();
-- 
cgit v0.10.2


From be8147e68625a1adb111acfd6b98a492be4b74d0 Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Wed, 2 Dec 2009 12:32:10 +0100
Subject: Revert "sched, x86: Optimize branch hint in __switch_to()"

This reverts commit a3a1de0c34de6f5f8332cd6151c46af7813c0fcb.

Commit 8ec6993d9f7d961014af970ded57542961fe9ad9 cleared the es
and ds selectors, so the original branch hints are correct now.
Therefore the branch hint doesn't need to be removed.

Signed-off-by: Tim Blechmann <tim@klingt.org>
LKML-Reference: <4B16503A.8030508@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 93c501d..eb62cbc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -406,10 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * This won't pick up thread selector changes, but I guess that is ok.
 	 */
 	savesegment(es, prev->es);
-	if (next->es | prev->es)
+	if (unlikely(next->es | prev->es))
 		loadsegment(es, next->es);
+
 	savesegment(ds, prev->ds);
-	if (next->ds | prev->ds)
+	if (unlikely(next->ds | prev->ds))
 		loadsegment(ds, next->ds);
 
 
-- 
cgit v0.10.2


From d99ca3b977fc5a93141304f571475c2af9e6c1c5 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 2 Dec 2009 17:26:47 +0900
Subject: sched, cputime: Cleanups related to task_times()

- Remove if({u,s}t)s because no one call it with NULL now.
- Use cputime_{add,sub}().
- Add ifndef-endif for prev_{u,s}time since they are used
  only when !VIRT_CPU_ACCOUNTING.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B1624C7.7040302@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0395b0f..dff85e5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1331,7 +1331,9 @@ struct task_struct {
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time; 		/* monotonic time */
 	struct timespec real_start_time;	/* boot based time */
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c4..ad7cb6d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1066,8 +1066,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->gtime = cputime_zero;
 	p->utimescaled = cputime_zero;
 	p->stimescaled = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	p->prev_utime = cputime_zero;
 	p->prev_stime = cputime_zero;
+#endif
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 4883fee..17e2c1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5184,10 +5184,8 @@ void account_idle_ticks(unsigned long ticks)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	if (ut)
-		*ut = p->utime;
-	if (st)
-		*st = p->stime;
+	*ut = p->utime;
+	*st = p->stime;
 }
 #else
 
@@ -5197,7 +5195,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, utime = p->utime, total = utime + p->stime;
+	cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
 
 	/*
 	 * Use CFS's precise accounting:
@@ -5217,12 +5215,10 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	 * Compare with previous values, to keep monotonicity:
 	 */
 	p->prev_utime = max(p->prev_utime, utime);
-	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+	p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
 
-	if (ut)
-		*ut = p->prev_utime;
-	if (st)
-		*st = p->prev_stime;
+	*ut = p->prev_utime;
+	*st = p->prev_stime;
 }
 #endif
 
-- 
cgit v0.10.2


From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 2 Dec 2009 17:28:07 +0900
Subject: sched, cputime: Introduce thread_group_times()

This is a real fix for problem of utime/stime values decreasing
described in the thread:

   http://lkml.org/lkml/2009/11/3/522

Now cputime is accounted in the following way:

 - {u,s}time in task_struct are increased every time when the thread
   is interrupted by a tick (timer interrupt).

 - When a thread exits, its {u,s}time are added to signal->{u,s}time,
   after adjusted by task_times().

 - When all threads in a thread_group exits, accumulated {u,s}time
   (and also c{u,s}time) in signal struct are added to c{u,s}time
   in signal struct of the group's parent.

So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.

And accounted values are used by:

 - task_times(), to get cputime of a thread:
   This function returns adjusted values that originates from raw
   {u,s}time and scaled by sum_exec_runtime that accounted by CFS.

 - thread_group_cputime(), to get cputime of a thread group:
   This function returns sum of all {u,s}time of living threads in
   the group, plus {u,s}time in the signal struct that is sum of
   adjusted cputimes of all exited threads belonged to the group.

The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:

  group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)

This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).

To fix this, we could do:

  group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)

But task_times() contains hard divisions, so applying it for
every thread should be avoided.

This patch fixes the above problem in the following way:

 - Modify thread's exit (= __exit_signal()) not to use task_times().
   It means {u,s}time in signal struct accumulates raw values instead
   of adjusted values.  As the result it makes thread_group_cputime()
   to return pure sum of "raw" values.

 - Introduce a new function thread_group_times(*task, *utime, *stime)
   that converts "raw" values of thread_group_cputime() to "adjusted"
   values, in same calculation procedure as task_times().

 - Modify group's exit (= wait_task_zombie()) to use this introduced
   thread_group_times().  It make c{u,s}time in signal struct to
   have adjusted values like before this patch.

 - Replace some thread_group_cputime() by thread_group_times().
   This replacements are only applied where conveys the "adjusted"
   cputime to users, and where already uses task_times() near by it.
   (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)

This patch have a positive side effect:

 - Before this patch, if a group contains many short-life threads
   (e.g. runs 0.9ms and not interrupted by ticks), the group's
   cputime could be invisible since thread's cputime was accumulated
   after adjusted: imagine adjustment function as adj(ticks, runtime),
     {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
   After this patch it will not happen because the adjustment is
   applied after accumulated.

v2:
 - remove if()s, put new variables into signal_struct.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ca61a88..2571da4 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -506,7 +506,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
@@ -517,9 +516,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_cputime(task, &cputime);
-			utime = cputime.utime;
-			stime = cputime.stime;
+			thread_group_times(task, &utime, &stime);
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dff85e5..34238bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -624,6 +624,9 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1723,6 +1726,7 @@ static inline void put_task_struct(struct task_struct *t)
 }
 
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index 2eaf68b..b221ad6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,8 +91,6 @@ static void __exit_signal(struct task_struct *tsk)
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
 	else {
-		cputime_t utime, stime;
-
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -112,9 +110,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		task_times(tsk, &utime, &stime);
-		sig->utime = cputime_add(sig->utime, utime);
-		sig->stime = cputime_add(sig->stime, stime);
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
 		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -1208,6 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;
+		cputime_t tgutime, tgstime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1223,20 +1221,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_times() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
 		 */
+		thread_group_times(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(p->utime,
-			cputime_add(sig->utime,
-				    sig->cutime)));
+			cputime_add(tgutime,
+				    sig->cutime));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(p->stime,
-			cputime_add(sig->stime,
-				    sig->cstime)));
+			cputime_add(tgstime,
+				    sig->cstime));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index ad7cb6d..3d6f121 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	sig->prev_utime = sig->prev_stime = cputime_zero;
+#endif
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 17e2c1d..e6ba726 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->utime;
 	*st = p->stime;
 }
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
 #else
 
 #ifndef nsecs_to_cputime
@@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->prev_utime;
 	*st = p->prev_stime;
 }
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime_add(cputime.utime, cputime.stime);
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * cputime.utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime,
+			      cputime_sub(rtime, sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
 #endif
 
 /*
diff --git a/kernel/sys.c b/kernel/sys.c
index bbdfce0..9968c5f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -911,16 +911,15 @@ change_okay:
 
 void do_sys_times(struct tms *tms)
 {
-	struct task_cputime cputime;
-	cputime_t cutime, cstime;
+	cputime_t tgutime, tgstime, cutime, cstime;
 
-	thread_group_cputime(current, &cputime);
 	spin_lock_irq(&current->sighand->siglock);
+	thread_group_times(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
 	spin_unlock_irq(&current->sighand->siglock);
-	tms->tms_utime = cputime_to_clock_t(cputime.utime);
-	tms->tms_stime = cputime_to_clock_t(cputime.stime);
+	tms->tms_utime = cputime_to_clock_t(tgutime);
+	tms->tms_stime = cputime_to_clock_t(tgstime);
 	tms->tms_cutime = cputime_to_clock_t(cutime);
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	cputime_t utime, stime;
-	struct task_cputime cputime;
+	cputime_t tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
 
 	memset((char *) r, 0, sizeof *r);
@@ -1372,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 				break;
 
 		case RUSAGE_SELF:
-			thread_group_cputime(p, &cputime);
-			utime = cputime_add(utime, cputime.utime);
-			stime = cputime_add(stime, cputime.stime);
+			thread_group_times(p, &tgutime, &tgstime);
+			utime = cputime_add(utime, tgutime);
+			stime = cputime_add(stime, tgstime);
 			r->ru_nvcsw += p->signal->nvcsw;
 			r->ru_nivcsw += p->signal->nivcsw;
 			r->ru_minflt += p->signal->min_flt;
-- 
cgit v0.10.2