From d3b8f889a220aed825accc28eb64ce283a0d51ac Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 17 Aug 2009 16:40:47 -0700 Subject: x86: Make tsc=reliable override boot time stability checks This patch makes the tsc=reliable option disable the boot time stability checks. Currently the option only disables the runtime watchdog checks. This change allows folks who want to override the boot time TSC stability checks and use the TSC when the system would otherwise disqualify it. There still are some situations that the TSC will be disqualified, such as cpufreq scaling. But these are situations where the box will hang if allowed. Patch also includes a fix for an issue found by Thomas Gleixner, where the TSC disqualification message wouldn't be printed after a call to unsynchronized_tsc(). Signed-off-by: John Stultz Cc: Andrew Morton Cc: akataria@vmware.com Cc: Stephen Hemminger LKML-Reference: <1250552447.7212.92.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b80..4c6b415 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2484,12 +2484,13 @@ and is between 256 and 4096 characters. It is defined in the file Format: ,,,,,,,, - tsc= Disable clocksource-must-verify flag for TSC. + tsc= Disable clocksource stability checks for TSC. Format: [x86] reliable: mark tsc clocksource as reliable, this - disables clocksource verification at runtime. - Used to enable high-resolution timer mode on older - hardware, and in virtualized environment. + disables clocksource verification at runtime, as well + as the stability checks done at bootup. Used to enable + high-resolution timer mode on older hardware, and in + virtualized environment. turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368..648fb26 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -825,6 +825,9 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; + + if (tsc_clocksource_reliable) + return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: @@ -832,10 +835,10 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (num_possible_cpus() > 1) - tsc_unstable = 1; + return 1; } - return tsc_unstable; + return 0; } static void __init init_tsc_clocksource(void) -- cgit v0.10.2 From 8e8be45e8e55daa381028aec339829929ddb53a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Sep 2010 16:16:14 -0700 Subject: rcu: add priority-inversion testing to rcutorture Add an optional test to force long-term preemption of RCU read-side critical sections, controlled by new test_boost, test_boost_interval, and test_boost_duration module parameters. This is to be used to test RCU priority boosting. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9d8e8fb..89613f9 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,6 +47,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and " @@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff = 0; /* Hold time within burst (us). 
*/ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ +static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ +static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -109,6 +119,7 @@ static struct task_struct *stats_task; static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; #define RCU_TORTURE_PIPE_LEN 10 @@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_allocerror; +static long n_rcu_torture_boost_afferror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -147,6 +164,16 @@ static int stutter_pause_test; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +#ifdef CONFIG_RCU_BOOST +#define rcu_can_boost() 1 +#else /* #ifdef CONFIG_RCU_BOOST */ +#define rcu_can_boost() 0 +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ + /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ @@ -277,6 +304,7 @@ struct rcu_torture_ops { void (*fqs)(void); int (*stats)(char *page); int irq_capable; + int can_boost; char *name; }; @@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu" }; @@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu_sync" }; @@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu_expedited" }; @@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = { }; /* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. 
If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. + */ + +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; + +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} + +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. */ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (jiffies - oldstarttime > ULONG_MAX / 2) { + schedule_timeout_uninterruptible(1); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (jiffies - endtime > ULONG_MAX / 2) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1. */ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. */ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* Clean up and exit. */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + return 0; +} + +/* * RCU torture force-quiescent-state kthread. Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability * of occurrence of some important types of race conditions. 
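The wait loops in rcu_torture_boost() above open-code a wraparound-safe "still before the deadline" test: for a free-running unsigned counter, a - b lands in the top half of the value range exactly when a precedes b, so "jiffies - endtime > ULONG_MAX / 2" reads as "jiffies is still before endtime" even across a jiffies wrap (the same comparison time_before() makes). A minimal standalone sketch of the idiom, assuming nothing beyond standard C:

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a is before b" for a free-running unsigned counter:
 * when a precedes b, the unsigned difference a - b wraps into the top
 * half of the range. This is the test the boost loops above rely on. */
static int counter_before(unsigned long a, unsigned long b)
{
	return a - b > ULONG_MAX / 2;
}

int main(void)
{
	unsigned long now = ULONG_MAX - 5;	/* counter about to wrap */
	unsigned long deadline = now + 10;	/* wraps around to 4 */

	printf("%d\n", counter_before(now, deadline));		/* 1: still before */
	printf("%d\n", counter_before(deadline + 1, deadline));	/* 0: passed */
	return 0;
}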
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d nt: %ld", + "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " + "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -941,8 +1077,19 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free), atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror, + n_rcu_torture_boost_allocerror, + n_rcu_torture_boost_afferror, + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, n_rcu_torture_timers); - if (atomic_read(&n_rcu_torture_mberror) != 0) + if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_boost_ktrerror != 0 || + n_rcu_torture_boost_rterror != 0 || + n_rcu_torture_boost_allocerror != 0 || + n_rcu_torture_boost_afferror != 0 || + n_rcu_torture_boost_failure != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (i > 1) { @@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg) } static inline void -rcu_torture_print_module_parms(char *tag) +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration); } -static struct notifier_block rcutorture_nb = { +static struct notifier_block rcutorture_shutdown_nb = { .notifier_call = rcutorture_shutdown_notify, }; +static void rcutorture_booster_cleanup(int cpu) +{ + struct task_struct *t; + + if (boost_tasks[cpu] == NULL) + return; + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + t = boost_tasks[cpu]; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + + /* This must be outside of the mutex, otherwise deadlock! */ + kthread_stop(t); +} + +static int rcutorture_booster_init(int cpu) +{ + int retval; + + if (boost_tasks[cpu] != NULL) + return 0; /* Already created, nothing more to do. */ + + /* Don't allow time recalculation while creating a new task. 
*/ + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, + "rcu_torture_boost"); + if (IS_ERR(boost_tasks[cpu])) { + retval = PTR_ERR(boost_tasks[cpu]); + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + n_rcu_torture_boost_ktrerror++; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + return retval; + } + kthread_bind(boost_tasks[cpu], cpu); + wake_up_process(boost_tasks[cpu]); + mutex_unlock(&boost_mutex); + return 0; +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + static void rcu_torture_cleanup(void) { @@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void) } fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_nb); + unregister_reboot_notifier(&rcutorture_shutdown_nb); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); @@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void) kthread_stop(fqs_task); } fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } /* Wait for all RCU callbacks to fire. */ @@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void) if (cur_ops->cleanup) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) - rcu_torture_print_module_parms("End of test: FAILURE"); + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else - rcu_torture_print_module_parms("End of test: SUCCESS"); + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } static int __init @@ -1242,7 +1464,7 @@ rcu_torture_init(void) nrealreaders = nreaders; else nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms("Start of test"); + rcu_torture_print_module_parms(cur_ops, "Start of test"); fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ @@ -1263,6 +1485,12 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_allocerror = 0; + n_rcu_torture_boost_afferror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) atomic_set(&rcu_torture_wcount[i], 0); for_each_possible_cpu(cpu) { @@ -1376,7 +1604,27 @@ rcu_torture_init(void) goto unwind; } } - register_reboot_notifier(&rcutorture_nb); + if (test_boost_interval < 1) + test_boost_interval = 1; + if (test_boost_duration < 2) + test_boost_duration = 2; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + int retval; + + boost_starttime = jiffies + test_boost_interval * HZ; + register_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) { + if (cpu_is_offline(i)) + continue; /* Heuristic: CPU can go offline. 
*/ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + register_reboot_notifier(&rcutorture_shutdown_nb); mutex_unlock(&fullstop_mutex); return 0; -- cgit v0.10.2 From a386b5af8edda1c742ce9f77891e112eefffc005 Mon Sep 17 00:00:00 2001 From: Kasper Pedersen Date: Wed, 20 Oct 2010 15:55:15 -0700 Subject: time: Compensate for rounding on odd-frequency clocksources When the clocksource is not a multiple of HZ, the clock will be off. For acpi_pm, HZ=1000 the error is 127.111 ppm: The rounding of cycle_interval ends up generating a false error term in ntp_error accumulation since xtime_interval is not exactly 1/HZ. So, we subtract out the error caused by the rounding. This has been visible since 2.6.32-rc2 commit a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601 time: Implement logarithmic time accumulation That commit raised NTP_INTERVAL_FREQ and exposed the rounding error. testing tool: http://n1.taur.dk/permanent/testpmt.c Also tested with ntpd and a frequency counter. Signed-off-by: Kasper Pedersen Acked-by: john stultz Cc: John Kacur Cc: Clark Williams Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d8..5bb86da 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,6 +32,8 @@ struct timekeeper { cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ u64 xtime_interval; + /* shifted nano seconds left over when rounding cycle_interval */ + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ u32 raw_interval; @@ -62,7 +64,7 @@ struct timekeeper timekeeper; static void timekeeper_setup_internals(struct clocksource *clock) { cycle_t interval; - u64 tmp; + u64 tmp, ntpinterval; timekeeper.clock = clock; clock->cycle_last = clock->read(clock); @@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; + ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) @@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Go back from cycles -> shifted ns */ timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; timekeeper.raw_interval = ((u64) interval * clock->mult) >> clock->shift; @@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; - timekeeper.ntp_error -= timekeeper.xtime_interval << + timekeeper.ntp_error -= + (timekeeper.xtime_interval + timekeeper.xtime_remainder) << (timekeeper.ntp_error_shift + shift); return offset; -- cgit v0.10.2 From d0959024d8fb6555ba8bfdc6624cc7b7c2e675fd Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Wed, 20 Oct 2010 15:57:30 -0700 Subject: timer_list: Remove alignment padding on 64 bit when CONFIG_TIMER_STATS Reorder struct timer_list to remove 8 bytes of alignment padding on 64 bit builds when CONFIG_TIMER_STATS is selected. timer_list is widely used across the kernel so many structures will benefit and shrink in size. For example, with my config on x86_64 per_cpu_dm_data shrinks from 136 to 128 bytes and ahci_port_priv shrinks from 1032 to 968 bytes. 
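The 8-byte saving in the patch below comes purely from field order. A standalone LP64 sketch of the effect, using stand-ins for the CONFIG_TIMER_STATS tail of struct timer_list (only the relevant members are modeled):

#include <stdio.h>

/* Before: the 8-byte pointer after the lone int forces 4 bytes of
 * padding, and the trailing int forces 4 more bytes of tail padding. */
struct stats_before {
	int slack;
	void *start_site;
	char start_comm[16];
	int start_pid;
};	/* 4 + 4 pad + 8 + 16 + 4 + 4 pad = 40 bytes on x86_64 */

/* After: pairing the two ints removes both holes. */
struct stats_after {
	int slack;
	int start_pid;
	void *start_site;
	char start_comm[16];
};	/* 4 + 4 + 8 + 16 = 32 bytes on x86_64 */

int main(void)
{
	printf("before: %zu, after: %zu\n",
	       sizeof(struct stats_before), sizeof(struct stats_after));
	return 0;
}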
Signed-off-by: Richard Kennedy Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/include/linux/timer.h b/include/linux/timer.h index 38cf093..f3dccdb 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -24,9 +24,9 @@ struct timer_list { int slack; #ifdef CONFIG_TIMER_STATS + int start_pid; void *start_site; char start_comm[16]; - int start_pid; #endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; -- cgit v0.10.2 From aaabe31c25a439b92cc281b14ca18b85bae7e7a6 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 20 Oct 2010 15:57:30 -0700 Subject: timer: Initialize the field slack of timer_list TIMER_INITIALIZER() should initialize the field slack of timer_list as __init_timer() does. Signed-off-by: Changli Gao Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/include/linux/timer.h b/include/linux/timer.h index f3dccdb..1794674 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -54,6 +54,7 @@ extern struct tvec_base boot_tvec_bases; .expires = (_expires), \ .data = (_data), \ .base = &boot_tvec_bases, \ + .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } -- cgit v0.10.2 From 2bf1c05e3c406925e498d06da66b4828f0209ea6 Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Wed, 20 Oct 2010 15:57:31 -0700 Subject: time: Use ARRAY_SIZE macro in timecompare.c Replace sizeof(buffer)/sizeof(buffer[0]) with ARRAY_SIZE(buffer) in kernel/time/timecompare.c Signed-off-by: Nikitas Angelinas Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb..a9ae369 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * fixed point arithmetic scale factor for skew @@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, int index; int num_samples = sync->num_samples; - if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { + if (num_samples > ARRAY_SIZE(buffer)) { samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); if (!samples) { samples = buffer; - num_samples = sizeof(buffer)/sizeof(buffer[0]); + num_samples = ARRAY_SIZE(buffer); } } else { samples = buffer; -- cgit v0.10.2 From dd6414b50fa2b1cd247a8aa8f8bd42414b7453e1 Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Wed, 20 Oct 2010 15:57:33 -0700 Subject: timer: Permit statically-declared work with deferrable timers Currently, you have to just define a delayed_work uninitialised, and then initialise it before first use. That's a tad clumsy. At risk of playing mind-games with the compiler, fooling it into doing pointer arithmetic with compile-time-constants, this lets clients properly initialise delayed work with deferrable timers statically. This patch was inspired by the issues which lead Artem Bityutskiy to commit 8eab945c5616fc984 ("sunrpc: make the cache cleaner workqueue deferrable"). Signed-off-by: Phil Carmody Acked-by: Artem Bityutskiy Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/include/linux/timer.h b/include/linux/timer.h index 1794674..cbfb7a3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -48,6 +48,18 @@ extern struct tvec_base boot_tvec_bases; #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif +/* + * Note that all tvec_bases are 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. 
Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .entry = { .prev = TIMER_ENTRY_STATIC }, \ .function = (_function), \ @@ -59,6 +71,19 @@ extern struct tvec_base boot_tvec_bases; __FILE__ ":" __stringify(__LINE__)) \ } +#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *) \ + ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG)) + +#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\ + .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .function = (_function), \ + .expires = (_expires), \ + .data = (_data), \ + .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases), \ + __TIMER_LOCKDEP_MAP_INITIALIZER( \ + __FILE__ ":" __stringify(__LINE__)) \ + } + #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ TIMER_INITIALIZER(_function, _expires, _data) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f11100f..88238c1 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -127,12 +127,20 @@ struct execute_work { .timer = TIMER_INITIALIZER(NULL, 0, 0), \ } +#define __DEFERRED_WORK_INITIALIZER(n, f) { \ + .work = __WORK_INITIALIZER((n).work, (f)), \ + .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0), \ + } + #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) +#define DECLARE_DEFERRED_WORK(n, f) \ + struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f) + /* * initialize a work item's function pointer */ diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05b..72853b2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; -/* - * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB to - * indicate whether the timer is deferrable. - * - * A deferrable timer will work normally when the system is busy, but - * will not cause a CPU to come out of idle just to service it; instead, - * the timer will be serviced when the CPU eventually wakes up with a - * subsequent non-deferrable timer. - */ -#define TBASE_DEFERRABLE_FLAG (0x1) - /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { @@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + timer->base = TBASE_MAKE_DEFERRED(timer->base); } static inline void -- cgit v0.10.2 From 20f33a03f0cf87e51165f7084f697acfb68e865b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 20 Oct 2010 15:57:34 -0700 Subject: posix-timers: Annotate lock_timer() lock_timer() conditionally grabs it_lock in case of returning non-NULL but unlock_timer() releases it unconditionally. This leads sparse to complain about the lock context imbalance. Rename and wrap lock_timer using __cond_lock() macro to make sparse happy. 
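The sparse-only plumbing behind that wrapper: under __CHECKER__, __cond_lock() tells the analyzer the lock is acquired exactly when the wrapped expression is non-zero, while a regular build sees the bare expression. From <linux/compiler.h>, roughly:

#ifdef __CHECKER__
# define __cond_lock(x, c)	((c) ? ({ __acquire(x); 1; }) : 0)
#else
# define __cond_lock(x, c)	(c)
#endif

With the lock_timer() macro in the patch, sparse therefore sees it_lock acquired only on the non-NULL path, balancing unlock_timer()'s unconditional release.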
Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973..93bd2eb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { @@ -619,7 +625,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* -- cgit v0.10.2 From 6f1bc451e6a79470b122a37ee1fc6bbca450f444 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 20 Oct 2010 15:57:31 -0700 Subject: timer: Make try_to_del_timer_sync() the same on SMP and UP On UP try_to_del_timer_sync() is mapped to del_timer() which does not take the running timer callback into account, so it has different semantics. Remove the SMP dependency of try_to_del_timer_sync() by using base->running_timer in the UP case as well. [ tglx: Removed set_running_timer() inline and tweaked the changelog ] Signed-off-by: Yong Zhang Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/include/linux/timer.h b/include/linux/timer.h index cbfb7a3..6abd913 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -274,11 +274,11 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) extern void add_timer(struct timer_list *timer); +extern int try_to_del_timer_sync(struct timer_list *timer); + #ifdef CONFIG_SMP - extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else -# define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif diff --git a/kernel/timer.c b/kernel/timer.c index 72853b2..47b86c1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -330,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); - -static inline void set_running_timer(struct tvec_base *base, - struct timer_list *timer) -{ -#ifdef CONFIG_SMP - base->running_timer = timer; -#endif -} - static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -923,15 +914,12 @@ int del_timer(struct timer_list *timer) } EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. - * - * It must not be called from interrupt contexts. */ int try_to_del_timer_sync(struct timer_list *timer) { @@ -960,6 +948,7 @@ out: } EXPORT_SYMBOL(try_to_del_timer_sync); +#ifdef CONFIG_SMP /** * del_timer_sync - deactivate a timer and wait for the handler to finish. 
* @timer: the timer to be deactivated @@ -1098,7 +1087,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); - set_running_timer(base, timer); + base->running_timer = timer; detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -1106,7 +1095,7 @@ static inline void __run_timers(struct tvec_base *base) spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + base->running_timer = NULL; spin_unlock_irq(&base->lock); } -- cgit v0.10.2 From 1118e2cd33d47254854e1ba3ba8e32802ff14fdf Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 20 Oct 2010 15:57:32 -0700 Subject: timer: Del_timer_sync() can be used in softirq context Actually we have used del_timer_sync() in softirq context for a long time, e.g. in __dst_free()::cancel_delayed_work(). So change the comments of it to warn on hardirq context only, and make lockdep know about this change. Signed-off-by: Yong Zhang Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index 47b86c1..612de03 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -959,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent + * hardirq contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. @@ -969,12 +969,10 @@ EXPORT_SYMBOL(try_to_del_timer_sync); int del_timer_sync(struct timer_list *timer) { #ifdef CONFIG_LOCKDEP - unsigned long flags; - - local_irq_save(flags); + local_bh_disable(); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - local_irq_restore(flags); + local_bh_enable(); #endif for (;;) { -- cgit v0.10.2 From 466bd3030973910118ca601da8072be97a1e2209 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 20 Oct 2010 15:57:33 -0700 Subject: timer: Warn when del_timer_sync() is called in hardirq context Add explict warning when del_timer_sync() is called in hardirq context. Signed-off-by: Yong Zhang Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index 612de03..483e54b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -974,7 +974,11 @@ int del_timer_sync(struct timer_list *timer) lock_map_release(&timer->lockdep_map); local_bh_enable(); #endif - + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_irq()); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) -- cgit v0.10.2 From fd35fbcdd1b2579a6e00a1545f7124e4005d0474 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 22 Oct 2010 15:33:38 -0700 Subject: x86-64, asm: Use fxsaveq/fxrestorq in more places Checkin d7acb92fea932ad2e7846480aeacddc2c03c8485 made use of fxsaveq in fpu_fxsave() if the assembler supports it; this adds fxsaveq/fxrstorq to fxrstor_checking() and fxsave_user() as well. Reported-by: Linus Torvalds LKML-Reference: Signed-off-by: H. 
Peter Anvin diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 4aa2bb3..ef32890 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) int err; /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxrstorq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "m" (*fx), "0" (0)); +#else asm volatile("1: rex64/fxrstor (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) : [fx] "R" (fx), "m" (*fx), "0" (0)); +#endif return err; } @@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) return -EFAULT; /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxsaveq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), [fx] "=m" (*fx) + : "0" (0)); +#else asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) : [fx] "R" (fx), "0" (0)); +#endif if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) err = -EFAULT; -- cgit v0.10.2 From 5a7ae78fd478624df3059cb6f55056b85d074acc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Oct 2010 10:46:28 -0700 Subject: x86: Allow platforms to force enable apic Some embedded x86 platforms don't setup the APIC in the BIOS/bootloader and would be forced to add "lapic" on the kernel command line. That's a bit akward. Split out the force enable code from detect_init_APIC() and allow platform code to call it from the platform setup. That avoids the command line parameter and possible replication of the MSR dance in the force enable code. 
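The "MSR dance" is the IA32_APICBASE manipulation that this patch factors out into apic_force_enable(). Distilled from the hunks below into one sequence (kernel context only; constants and helpers are the kernel's, and this sketch is an illustration rather than a substitute for the patch's functions):

/* If firmware left the enable bit clear in IA32_APICBASE, program the
 * default base plus the enable bit, then confirm via CPUID that the
 * APIC feature bit is visible again. */
static int force_enable_lapic_sketch(void)
{
	u32 l, h;

	rdmsr(MSR_IA32_APICBASE, l, h);
	if (!(l & MSR_IA32_APICBASE_ENABLE)) {
		l &= ~MSR_IA32_APICBASE_BASE;	/* clear the base field */
		l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
		wrmsr(MSR_IA32_APICBASE, l, h);
	}
	/* CPUID leaf 1, EDX bit 9: local APIC present and enabled */
	return (cpuid_edx(1) & (1 << X86_FEATURE_APIC)) ? 0 : -1;
}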
Signed-off-by: Thomas Gleixner LKML-Reference: <1287510389-8388-1-git-send-email-dirk.brandewie@gmail.com> Signed-off-by: Dirk Brandewie diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 286de34..ad50aaa 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); extern void enable_NMI_through_LVT0(void); +extern int apic_force_enable(void); /* * On 32bit this is mach-xxx local diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 850657d..4638396 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1531,13 +1531,60 @@ static int __init detect_init_APIC(void) return 0; } #else + +static int apic_verify(void) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warning("Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + pr_info("Found and enabled local APIC!\n"); + return 0; +} + +int apic_force_enable(void) +{ + u32 h, l; + + if (disable_apic) + return -1; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + return apic_verify(); +} + /* * Detect and initialize APIC */ static int __init detect_init_APIC(void) { - u32 h, l, features; - /* Disabled by kernel option? */ if (disable_apic) return -1; @@ -1567,38 +1614,12 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. 
- */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - pr_info("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); - return -1; + if (apic_force_enable()) + return -1; + } else { + if (apic_verify()) + return -1; } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); -- cgit v0.10.2 From 23f9b267159b4c7ff59d2e6c8ed31693eff841e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2010 15:38:50 -0700 Subject: x86: apic: Move probe_nr_irqs_gsi() into ioapic_init_mappings() probe_br_irqs_gsi() is called right after ioapic_init_mappings() and there are no other users. Move it into ioapic_init_mappings() so the declaration can disappear and the function can become static. Rename ioapic_init_mappings() to ioapic_and_gsi_init() to reflect that change. Signed-off-by: Thomas Gleixner LKML-Reference: <1287510389-8388-2-git-send-email-dirk.brandewie@gmail.com> Signed-off-by: Dirk Brandewie diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c8be456..7e620ba 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -159,7 +159,7 @@ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); void setup_IO_APIC_irq_extra(u32 gsi); -extern void ioapic_init_mappings(void); +extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); @@ -168,8 +168,6 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void probe_nr_irqs_gsi(void); - extern void setup_ioapic_ids_from_mpc(void); struct mp_ioapic_gsi{ @@ -190,7 +188,6 @@ extern void __init pre_init_apic_IRQ0(void); static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } -static inline void probe_nr_irqs_gsi(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8ae808d..ce3c6fb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3639,7 +3639,7 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries + 1; } -void __init probe_nr_irqs_gsi(void) +static void __init probe_nr_irqs_gsi(void) { int nr; @@ -3951,7 +3951,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_init_mappings(void) +void __init ioapic_and_gsi_init(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -3989,6 +3989,8 @@ fake_ioapic_page: ioapic_res->end = ioapic_phys + 
IO_APIC_SLOT_SIZE - 1; ioapic_res++; } + + probe_nr_irqs_gsi(); } void __init ioapic_insert_resources(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 420e641..b8982e0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1017,10 +1017,7 @@ void __init setup_arch(char **cmdline_p) #endif init_apic_mappings(); - ioapic_init_mappings(); - - /* need to wait for io_apic is mapped */ - probe_nr_irqs_gsi(); + ioapic_and_gsi_init(); kvm_guest_init(); -- cgit v0.10.2 From fe7de49f9d4e53f24ec9ef762a503f70b562341c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 20 Oct 2010 16:01:12 -0700 Subject: sched: Make sched_param argument static in sched_setscheduler() callers Andrew Morton pointed out almost all sched_setscheduler() callers are using fixed parameters and can be converted to static. It reduces runtime memory use a little. Signed-off-by: KOSAKI Motohiro Reported-by: Andrew Morton Acked-by: James Morris Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 0383601..849c867 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1942,9 +1942,10 @@ extern int task_nice(const struct task_struct *p); extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); -extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); +extern int sched_setscheduler(struct task_struct *, int, + const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, - struct sched_param *); + const struct sched_param *); extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 644e8d5..850f030 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -573,7 +573,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } */ static int irq_thread(void *data) { - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + static struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); int wake, oneshot = desc->status & IRQ_ONESHOT; diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786..74cf6f5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), wait_for_completion(&create.done); if (!IS_ERR(create.result)) { - struct sched_param param = { .sched_priority = 0 }; + static struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); diff --git a/kernel/sched.c b/kernel/sched.c index d42992b..51944e8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4701,7 +4701,7 @@ static bool check_same_owner(struct task_struct *p) } static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) + const struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4856,7 +4856,7 @@ recheck: * NOTE that the task may be already dead. 
*/ int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, true); } @@ -4874,7 +4874,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * but our caller might not have that capability. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, false); } diff --git a/kernel/softirq.c b/kernel/softirq.c index fc97888..081869e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -851,7 +851,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { + .sched_priority = MAX_RT_PRIO-1 + }; p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415..562c56e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; + static struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba68..94ca779 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -307,7 +307,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); -- cgit v0.10.2 From 7fb2b870d6a3b92f6750ac2b72858fd098dc9e3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 24 Oct 2010 11:11:22 +0200 Subject: x86: io_apic: Fix CONFIG_X86_IO_APIC=n breakage Stupid me forgot to change the function name for the CONFIG_X86_IO_APIC=n case in commit 23f9b2671 (x86: apic: Move probe_nr_irqs_gsi() into ioapic_init_mappings()) Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 7e620ba..240a0a5 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -186,7 +186,7 @@ extern void __init pre_init_apic_IRQ0(void); #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; -static inline void ioapic_init_mappings(void) { } +static inline void ioapic_and_gsi_init(void) { } static inline void ioapic_insert_resources(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } -- cgit v0.10.2 From 9c37c9d89773ee9da9f6af28ee37d931bd045711 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 Oct 2010 16:03:35 +0200 Subject: mce, amd: Implement mce_threshold_block_init() helper function This patch adds a helper function for the initial setup of an mce threshold block. The LVT offset is passed as argument. Also making variable threshold_defaults local as it is only used in function mce_amd_feature_init(). 
Function threshold_restart_bank() is extended to setup the LVT offset, the change is backward compatible. Thus, now there is only a single wrmsrl() to setup the block. Signed-off-by: Robert Richter Acked-by: Borislav Petkov LKML-Reference: <1288015419-29543-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 80c4823..f438318 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -59,12 +59,6 @@ struct threshold_block { struct list_head miscj; }; -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - struct threshold_bank { struct kobject *kobj; struct threshold_block *blocks; @@ -89,6 +83,8 @@ static void amd_threshold_interrupt(void); struct thresh_restart { struct threshold_block *b; int reset; + int set_lvt_off; + int lvt_off; u16 old_limit; }; @@ -116,6 +112,12 @@ static void threshold_restart_bank(void *_tr) (new_count & THRESHOLD_MAX); } + if (tr->set_lvt_off) { + /* set new lvt offset */ + mci_misc_hi &= ~MASK_LVTOFF_HI; + mci_misc_hi |= tr->lvt_off << 20; + } + tr->b->interrupt_enable ? (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : (mci_misc_hi &= ~MASK_INT_TYPE_HI); @@ -124,13 +126,25 @@ static void threshold_restart_bank(void *_tr) wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); } +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { + struct threshold_block b; unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - struct thresh_restart tr; int lvt_off = -1; u8 offset; @@ -186,16 +200,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) continue; } - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); - - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; } } @@ -298,9 +309,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) b->interrupt_enable = !!new; + memset(&tr, 0, sizeof(tr)); tr.b = b; - tr.reset = 0; - tr.old_limit = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -321,10 +331,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) if (new < 1) new = 1; + memset(&tr, 0, sizeof(tr)); tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; - tr.reset = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); -- cgit v0.10.2 From 7203a0494084541575bac6dfc4e153f9e28869b8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 Oct 2010 16:03:36 +0200 Subject: mce, amd: Shorten local variables mci_misc_{hi,lo} Shorten this variables to make later changes more readable. 
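These per-bank threshold registers are 64-bit MSRs handled as two 32-bit halves, and the LVT offset occupies bits 20-23 of the high word (MASK_LVTOFF_HI above, 0x00F00000). A standalone sketch of the field update threshold_restart_bank() performs:

#include <stdio.h>
#include <stdint.h>

#define MASK_LVTOFF_HI	0x00F00000U	/* bits 20-23 of the hi word */

/* Mirrors the hi-word manipulation in threshold_restart_bank():
 * clear the old LVT offset field, then install the new offset. */
static uint32_t set_lvt_off(uint32_t hi, uint32_t off)
{
	hi &= ~MASK_LVTOFF_HI;
	hi |= (off << 20) & MASK_LVTOFF_HI;
	return hi;
}

int main(void)
{
	uint32_t hi = set_lvt_off(0x12345678, 1);

	printf("hi = 0x%08x, lvt_off = %u\n",
	       hi, (hi & MASK_LVTOFF_HI) >> 20);	/* 0x12145678, 1 */
	return 0;
}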
Signed-off-by: Robert Richter Acked-by: Borislav Petkov LKML-Reference: <1288015419-29543-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f438318..eb771b9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -93,37 +93,37 @@ struct thresh_restart { static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; + u32 hi, lo; - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) tr->reset = 1; /* limit cannot be lower than err count */ if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | (THRESHOLD_MAX - tr->b->threshold_limit); } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + + int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + hi = (hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } if (tr->set_lvt_off) { /* set new lvt offset */ - mci_misc_hi &= ~MASK_LVTOFF_HI; - mci_misc_hi |= tr->lvt_off << 20; + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; } tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); + (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (hi &= ~MASK_INT_TYPE_HI); - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); } static void mce_threshold_block_init(struct threshold_block *b, int offset) -- cgit v0.10.2 From bbaff08dca3c34d0fb6b4c4051354184e33e3df8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 Oct 2010 16:03:37 +0200 Subject: mce, amd: Add helper functions to setup APIC This patch reworks and cleans up mce_amd_feature_init() by introducing helper functions to setup and check the LVT offset. It also fixes line endings in pr_err() calls. 
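The scheme in the patch below has two halves: setup_APIC_mce() reserves the first LVT offset a bank/block reports (via setup_APIC_eilvt()) and keeps returning it, while lvt_off_valid() flags any later block whose MSR disagrees as a firmware bug. A standalone sketch of that first-caller-wins check, with the EILVT programming elided:

#include <stdio.h>

/* First block to report an offset reserves it; "reserved" starts at -1. */
static int reserve_or_keep(int reserved, int new)
{
	return reserved < 0 ? new : reserved;
}

int main(void)
{
	int offset = -1;
	int fw_offsets[] = { 2, 2, 3 };	/* per-block values read from MSRs */
	int i;

	for (i = 0; i < 3; i++) {
		offset = reserve_or_keep(offset, fw_offsets[i]);
		if (offset != fw_offsets[i])
			printf("block %d: FW_BUG, offset %d != reserved %d\n",
			       i, fw_offsets[i], offset);	/* fires for block 2 */
	}
	return 0;
}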
Signed-off-by: Robert Richter Acked-by: Borislav Petkov LKML-Reference: <1288015419-29543-4-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index eb771b9..e316684 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -31,8 +31,6 @@ #include #include -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" #define NR_BANKS 6 #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -88,6 +86,27 @@ struct thresh_restart { u16 old_limit; }; +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + /* must be called with correct cpu affinity */ /* Called via smp_call_function_single() */ static void threshold_restart_bank(void *_tr) @@ -113,9 +132,11 @@ static void threshold_restart_bank(void *_tr) } if (tr->set_lvt_off) { - /* set new lvt offset */ - hi &= ~MASK_LVTOFF_HI; - hi |= tr->lvt_off << 20; + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } } tr->b->interrupt_enable ? @@ -138,6 +159,15 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -145,8 +175,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int lvt_off = -1; - u8 offset; + int offset = -1; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -177,28 +206,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif - offset = (high & MASK_LVTOFF_HI) >> 20; - if (lvt_off < 0) { - if (setup_APIC_eilvt(offset, - THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0)) { - pr_err(FW_BUG "cpu %d, failed to " - "setup threshold interrupt " - "for bank %d, block %d " - "(MSR%08X=0x%x%08x)", - smp_processor_id(), bank, block, - address, high, low); - continue; - } - lvt_off = offset; - } else if (lvt_off != offset) { - pr_err(FW_BUG "cpu %d, invalid threshold " - "interrupt offset %d for bank %d," - "block %d (MSR%08X=0x%x%08x)", - smp_processor_id(), lvt_off, bank, - block, address, high, low); - continue; - } + offset = setup_APIC_mce(offset, + (high & MASK_LVTOFF_HI) >> 20); memset(&b, 0, sizeof(b)); b.cpu = cpu; -- cgit v0.10.2 From 0a17941e71f089b128514f7b5b486e20072ca7dc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 Oct 2010 16:03:38 +0200 Subject: mce, amd: Remove goto in threshold_create_device() Removing the goto in threshold_create_device(). 
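The shape of the change, in isolation: when the label does nothing but fall through to the return, the error path may as well return directly. Hypothetical names below; the real loop walks the NR_BANKS MCE banks:

int create_one(int bank);	/* stand-in for threshold_create_bank() */

/* before: "goto out" only falls through to the return */
int create_all_goto(void)
{
	int err = 0, bank;

	for (bank = 0; bank < 6; bank++) {
		err = create_one(bank);
		if (err)
			goto out;
	}
out:
	return err;
}

/* after: identical control flow, one label fewer */
int create_all_return(void)
{
	int err = 0, bank;

	for (bank = 0; bank < 6; bank++) {
		err = create_one(bank);
		if (err)
			return err;
	}
	return err;
}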
Signed-off-by: Robert Richter Acked-by: Borislav Petkov LKML-Reference: <1288015419-29543-5-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e316684..5bf2fac 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -622,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu) continue; err = threshold_create_bank(cpu, bank); if (err) - goto out; + return err; } -out: + return err; } -- cgit v0.10.2 From eb48c9cb2053e7bb5f7f8f0371cb578a0d439450 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 Oct 2010 16:03:39 +0200 Subject: apic, amd: Make firmware bug messages more meaningful This improves error messages in case the BIOS was setting up wrong LVT offsets. Signed-off-by: Robert Richter Acked-by: Borislav Petkov LKML-Reference: <1288015419-29543-6-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 850657d..cb13048 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -433,17 +433,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) reserved = reserve_eilvt_offset(offset, new); if (reserved != new) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " - "vector 0x%x was already reserved by another core, " - "APIC%lX=0x%x\n", - smp_processor_id(), new, reserved, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); return -EINVAL; } if (!eilvt_entry_is_changeable(old, new)) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " - "register already in use, APIC%lX=0x%x\n", - smp_processor_id(), new, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); return -EBUSY; } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 42fb46f..08de254 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -566,6 +566,7 @@ static int force_ibs_eilvt_setup(void) ret = setup_ibs_ctl(i); if (ret) return ret; + pr_err(FW_BUG "using offset %d for IBS interrupts\n", i); return 0; } -- cgit v0.10.2 From 1da4b1c6a4dfb5a13d7147a27c1ac53fed09befd Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 9 Nov 2010 11:22:58 +0000 Subject: x86/mrst: Add SFI platform device parsing code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SFI provides a series of tables. These describe the platform devices present including SPI and I²C devices, as well as various sensors, keypads and other glue as well as interfaces provided via the SCU IPC mechanism (intel_scu_ipc.c) This patch is a merge of the core elements and relevant fixes from the Intel development code by Feng, Alek, myself into a single coherent patch for upstream submission. It provides the needed infrastructure to register I2C, SPI and platform devices described by the tables, as well as handlers for some of the hardware already supported in kernel. The 0.8 firmware also provides GPIO tables. Devices are created at boot time or if they are SCU dependant at the point an SCU is discovered. 
The existing Linux device mechanisms will then handle the device binding. At an abstract level this is an SFI to Linux device translator. Device/platform specific setup/glue is in this file. This is done so that the drivers for the generic I²C and SPI bus devices remain cross platform as they should. (Updated from RFC version to correct the emc1403 name used by the firmware and a wrongly used #define) Signed-off-by: Alek Du LKML-Reference: <20101109112158.20013.6158.stgit@localhost.localdomain> [Clean ups, removal of 0.7 support] Signed-off-by: Feng Tang [Clean ups] Signed-off-by: Alan Cox Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e832768..b306b84 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -385,6 +385,8 @@ config X86_MRST depends on X86_EXTENDED_PLATFORM depends on X86_IO_APIC select APB_TIMER + select I2C + select SPI ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 4a711a6..283debd 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -50,4 +50,8 @@ extern void mrst_early_console_init(void); extern struct console early_hsu_console; extern void hsu_early_console_init(void); + +extern void intel_scu_devices_create(void); +extern void intel_scu_devices_destroy(void); + #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 79ae681..cfa1af2 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -9,9 +9,19 @@ * as published by the Free Software Foundation; version 2 * of the License. */ + +#define pr_fmt(fmt) "mrst: " fmt + #include #include #include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -23,8 +33,10 @@ #include #include #include +#include #include + /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, * cmdline option x86_mrst_timer can be used to override the configuration @@ -102,10 +114,10 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table) memcpy(sfi_mtimer_array, pentry, totallen); } - printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num); pentry = sfi_mtimer_array; for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz," " irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); if (!pentry->irq) @@ -176,10 +188,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) memcpy(sfi_mrtc_array, pentry, totallen); } - printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num); pentry = sfi_mrtc_array; for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->irq); mp_irq.type = MP_IOAPIC; mp_irq.irqtype = mp_INT; @@ -309,3 +321,498 @@ static inline int __init setup_x86_mrst_timer(char *arg) return 0; } __setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * Parsing GPIO table first, since the DEVS table will need this table + * to map the pin name to the actual pin. 
+ */ +static struct sfi_gpio_table_entry *gpio_table; +static int gpio_num_entry; + +static int __init sfi_parse_gpio(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_gpio_table_entry *pentry; + int num, i; + + if (gpio_table) + return 0; + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); + pentry = (struct sfi_gpio_table_entry *)sb->pentry; + + gpio_table = (struct sfi_gpio_table_entry *) + kmalloc(num * sizeof(*pentry), GFP_KERNEL); + if (!gpio_table) + return -1; + memcpy(gpio_table, pentry, num * sizeof(*pentry)); + gpio_num_entry = num; + + pr_debug("GPIO pin info:\n"); + for (i = 0; i < num; i++, pentry++) + pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s," + " pin = %d\n", i, + pentry->controller_name, + pentry->pin_name, + pentry->pin_no); + return 0; +} + +static int get_gpio_by_name(const char *name) +{ + struct sfi_gpio_table_entry *pentry = gpio_table; + int i; + + if (!pentry) + return -1; + for (i = 0; i < gpio_num_entry; i++, pentry++) { + if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) + return pentry->pin_no; + } + return -1; +} + +/* + * Here defines the array of devices platform data that IAFW would export + * through SFI "DEVS" table, we use name and type to match the device and + * its platform data. + */ +struct devs_id { + char name[SFI_NAME_LEN + 1]; + u8 type; + u8 delay; + void *(*get_platform_data)(void *info); +}; + +/* the offset for the mapping of global gpio pin to irq */ +#define MRST_IRQ_OFFSET 0x100 + +static void __init *pmic_gpio_platform_data(void *info) +{ + static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; + int gpio_base = get_gpio_by_name("pmic_gpio_base"); + + if (gpio_base == -1) + gpio_base = 64; + pmic_gpio_pdata.gpio_base = gpio_base; + pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET; + pmic_gpio_pdata.gpiointr = 0xffffeff8; + + return &pmic_gpio_pdata; +} + +static void __init *max3111_platform_data(void *info) +{ + struct spi_board_info *spi_info = info; + int intr = get_gpio_by_name("max3111_int"); + + if (intr == -1) + return NULL; + spi_info->irq = intr + MRST_IRQ_OFFSET; + return NULL; +} + +/* we have multiple max7315 on the board ... 
*/ +#define MAX7315_NUM 2 +static void __init *max7315_platform_data(void *info) +{ + static struct pca953x_platform_data max7315_pdata[MAX7315_NUM]; + static int nr; + struct pca953x_platform_data *max7315 = &max7315_pdata[nr]; + struct i2c_board_info *i2c_info = info; + int gpio_base, intr; + char base_pin_name[SFI_NAME_LEN + 1]; + char intr_pin_name[SFI_NAME_LEN + 1]; + + if (nr == MAX7315_NUM) { + pr_err("too many max7315s, we only support %d\n", + MAX7315_NUM); + return NULL; + } + /* we have several max7315 on the board, we only need load several + * instances of the same pca953x driver to cover them + */ + strcpy(i2c_info->type, "max7315"); + if (nr++) { + sprintf(base_pin_name, "max7315_%d_base", nr); + sprintf(intr_pin_name, "max7315_%d_int", nr); + } else { + strcpy(base_pin_name, "max7315_base"); + strcpy(intr_pin_name, "max7315_int"); + } + + gpio_base = get_gpio_by_name(base_pin_name); + intr = get_gpio_by_name(intr_pin_name); + + if (gpio_base == -1) + return NULL; + max7315->gpio_base = gpio_base; + if (intr != -1) { + i2c_info->irq = intr + MRST_IRQ_OFFSET; + max7315->irq_base = gpio_base + MRST_IRQ_OFFSET; + } else { + i2c_info->irq = -1; + max7315->irq_base = -1; + } + return max7315; +} + +static void __init *emc1403_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("thermal_int"); + int intr2nd = get_gpio_by_name("thermal_alert"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + MRST_IRQ_OFFSET; + intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static void __init *lis331dl_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("accel_int"); + int intr2nd = get_gpio_by_name("accel_2"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + MRST_IRQ_OFFSET; + intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static const struct devs_id __initconst device_ids[] = { + {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, + {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, + {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, + {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, + {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, + {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, + {}, +}; + +#define MAX_IPCDEVS 24 +static struct platform_device *ipc_devs[MAX_IPCDEVS]; +static int ipc_next_dev; + +#define MAX_SCU_SPI 24 +static struct spi_board_info *spi_devs[MAX_SCU_SPI]; +static int spi_next_dev; + +#define MAX_SCU_I2C 24 +static struct i2c_board_info *i2c_devs[MAX_SCU_I2C]; +static int i2c_bus[MAX_SCU_I2C]; +static int i2c_next_dev; + +static void __init intel_scu_device_register(struct platform_device *pdev) +{ + if(ipc_next_dev == MAX_IPCDEVS) + pr_err("too many SCU IPC devices"); + else + ipc_devs[ipc_next_dev++] = pdev; +} + +static void __init intel_scu_spi_device_register(struct spi_board_info *sdev) +{ + struct spi_board_info *new_dev; + + if (spi_next_dev == MAX_SCU_SPI) { + pr_err("too many SCU SPI devices"); + return; + } + + new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed spi dev %s\n", + sdev->modalias); + return; + } + memcpy(new_dev, sdev, sizeof(*sdev)); + + spi_devs[spi_next_dev++] = new_dev; +} + +static void __init intel_scu_i2c_device_register(int bus, + struct i2c_board_info 
*idev) +{ + struct i2c_board_info *new_dev; + + if (i2c_next_dev == MAX_SCU_I2C) { + pr_err("too many SCU I2C devices"); + return; + } + + new_dev = kzalloc(sizeof(*idev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed i2c dev %s\n", + idev->type); + return; + } + memcpy(new_dev, idev, sizeof(*idev)); + + i2c_bus[i2c_next_dev] = bus; + i2c_devs[i2c_next_dev++] = new_dev; +} + +/* Called by IPC driver */ +void intel_scu_devices_create(void) +{ + int i; + + for (i = 0; i < ipc_next_dev; i++) + platform_device_add(ipc_devs[i]); + + for (i = 0; i < spi_next_dev; i++) + spi_register_board_info(spi_devs[i], 1); + + for (i = 0; i < i2c_next_dev; i++) { + struct i2c_adapter *adapter; + struct i2c_client *client; + + adapter = i2c_get_adapter(i2c_bus[i]); + if (adapter) { + client = i2c_new_device(adapter, i2c_devs[i]); + if (!client) + pr_err("can't create i2c device %s\n", + i2c_devs[i]->type); + } else + i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); + } +} +EXPORT_SYMBOL_GPL(intel_scu_devices_create); + +/* Called by IPC driver */ +void intel_scu_devices_destroy(void) +{ + int i; + + for (i = 0; i < ipc_next_dev; i++) + platform_device_del(ipc_devs[i]); +} +EXPORT_SYMBOL_GPL(intel_scu_devices_destroy); + +static void __init install_irq_resource(struct platform_device *pdev, int irq) +{ + /* Single threaded */ + static struct resource __initdata res = { + .name = "IRQ", + .flags = IORESOURCE_IRQ, + }; + res.start = irq; + platform_device_add_resources(pdev, &res, 1); +} + +static void __init sfi_handle_ipc_dev(struct platform_device *pdev) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_IPC && + !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(pdev); + break; + } + dev++; + } + pdev->dev.platform_data = pdata; + intel_scu_device_register(pdev); +} + +static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_SPI && + !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(spi_info); + break; + } + dev++; + } + spi_info->platform_data = pdata; + if (dev->delay) + intel_scu_spi_device_register(spi_info); + else + spi_register_board_info(spi_info, 1); +} + +static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_I2C && + !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(i2c_info); + break; + } + dev++; + } + i2c_info->platform_data = pdata; + + if (dev->delay) + intel_scu_i2c_device_register(bus, i2c_info); + else + i2c_register_board_info(bus, i2c_info, 1); + } + + +static int __init sfi_parse_devs(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_device_table_entry *pentry; + struct spi_board_info spi_info; + struct i2c_board_info i2c_info; + struct platform_device *pdev; + int num, i, bus; + int ioapic; + struct io_apic_irq_attr irq_attr; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); + pentry = (struct sfi_device_table_entry *)sb->pentry; + + for (i = 0; i < num; i++, pentry++) { + if (pentry->irq != (u8)0xff) { /* native RTE case */ + /* these SPI2 devices are not exposed to system as PCI + * devices, but they have 
separate RTE entry in IOAPIC + * so we have to enable them one by one here + */ + ioapic = mp_find_ioapic(pentry->irq); + irq_attr.ioapic = ioapic; + irq_attr.ioapic_pin = pentry->irq; + irq_attr.trigger = 1; + irq_attr.polarity = 1; + io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); + } + switch (pentry->type) { + case SFI_DEV_TYPE_IPC: + /* ID as IRQ is a hack that will go away */ + pdev = platform_device_alloc(pentry->name, pentry->irq); + if (pdev == NULL) { + pr_err("out of memory for SFI platform device '%s'.\n", + pentry->name); + continue; + } + install_irq_resource(pdev, pentry->irq); + pr_debug("info[%2d]: IPC bus, name = %16.16s, " + "irq = 0x%2x\n", i, pentry->name, pentry->irq); + sfi_handle_ipc_dev(pdev); + break; + case SFI_DEV_TYPE_SPI: + memset(&spi_info, 0, sizeof(spi_info)); + strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); + spi_info.irq = pentry->irq; + spi_info.bus_num = pentry->host_num; + spi_info.chip_select = pentry->addr; + spi_info.max_speed_hz = pentry->max_freq; + pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, " + "irq = 0x%2x, max_freq = %d, cs = %d\n", i, + spi_info.bus_num, + spi_info.modalias, + spi_info.irq, + spi_info.max_speed_hz, + spi_info.chip_select); + sfi_handle_spi_dev(&spi_info); + break; + case SFI_DEV_TYPE_I2C: + memset(&i2c_info, 0, sizeof(i2c_info)); + bus = pentry->host_num; + strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); + i2c_info.irq = pentry->irq; + i2c_info.addr = pentry->addr; + pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, " + "irq = 0x%2x, addr = 0x%x\n", i, bus, + i2c_info.type, + i2c_info.irq, + i2c_info.addr); + sfi_handle_i2c_dev(bus, &i2c_info); + break; + case SFI_DEV_TYPE_UART: + case SFI_DEV_TYPE_HSI: + default: + ; + } + } + return 0; +} + +static int __init mrst_platform_init(void) +{ + sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); + sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); + return 0; +} +arch_initcall(mrst_platform_init); + +/* + * we will search these buttons in SFI GPIO table (by name) + * and register them dynamically. Please add all possible + * buttons here, we will shrink them if no GPIO found. 
+ */ +static struct gpio_keys_button gpio_button[] = { + {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000}, + {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20}, + {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20}, + {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, + {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, + {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, + {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, + {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20}, +}; + +static struct gpio_keys_platform_data mrst_gpio_keys = { + .buttons = gpio_button, + .rep = 1, + .nbuttons = -1, /* will fill it after search */ +}; + +static struct platform_device pb_device = { + .name = "gpio-keys", + .id = -1, + .dev = { + .platform_data = &mrst_gpio_keys, + }, +}; + +/* + * Shrink the non-existent buttons, register the gpio button + * device if there is some + */ +static int __init pb_keys_init(void) +{ + struct gpio_keys_button *gb = gpio_button; + int i, num, good = 0; + + num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); + for (i = 0; i < num; i++) { + gb[i].gpio = get_gpio_by_name(gb[i].desc); + if (gb[i].gpio == -1) + continue; + + if (i != good) + gb[good] = gb[i]; + good++; + } + + if (good) { + mrst_gpio_keys.nbuttons = good; + return platform_device_register(&pb_device); + } + return 0; +} +late_initcall(pb_keys_init); diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 41a9e34..ca35b0c 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -26,6 +26,7 @@ #include #include #include +#include /* IPC defines the following message types */ #define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */ @@ -699,6 +700,9 @@ static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id) iounmap(ipcdev.ipc_base); return -ENOMEM; } + + intel_scu_devices_create(); + return 0; } @@ -720,6 +724,7 @@ static void ipc_remove(struct pci_dev *pdev) iounmap(ipcdev.ipc_base); iounmap(ipcdev.i2c_base); ipcdev.pdev = NULL; + intel_scu_devices_destroy(); } static const struct pci_device_id pci_ids[] = { diff --git a/include/linux/sfi.h b/include/linux/sfi.h index 7f770c6..fe81791 100644 --- a/include/linux/sfi.h +++ b/include/linux/sfi.h @@ -77,6 +77,8 @@ #define SFI_OEM_ID_SIZE 6 #define SFI_OEM_TABLE_ID_SIZE 8 +#define SFI_NAME_LEN 16 + #define SFI_SYST_SEARCH_BEGIN 0x000E0000 #define SFI_SYST_SEARCH_END 0x000FFFFF @@ -156,13 +158,13 @@ struct sfi_device_table_entry { u16 addr; u8 irq; u32 max_freq; - char name[16]; + char name[SFI_NAME_LEN]; } __packed; struct sfi_gpio_table_entry { - char controller_name[16]; + char controller_name[SFI_NAME_LEN]; u16 pin_no; - char pin_name[16]; + char pin_name[SFI_NAME_LEN]; } __packed; typedef int (*sfi_table_handler) (struct sfi_table_header *table); -- cgit v0.10.2 From 7f05dec3dd70f086870fdc1d40dbe30db1fe0994 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 9 Nov 2010 11:28:43 +0000 Subject: x86: mrst: Parse SFI timer table for all timer configs Penwell has APB timer based watchdog timers, it requires platform code to parse SFI MTMR tables in order to claim its timer. This patch will always parse SFI MTMR regardless of system timer configuration choices. Otherwise, SFI MTMR table may not get parsed if running on Medfield with always-on local APIC timers and constant TSC. Watchdog timer driver will then not get a timer to use. 
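[A reduced sketch of the reordering, with stand-in names rather than the real mrst code: hoisting the table parse ahead of the timer-mode switch guarantees it runs on every path, including the default case that returns early when an always-on local APIC timer makes the APB timer unnecessary.]

#include <stdio.h>

enum timer_mode { APBT_ONLY, LAPIC_APBT, DEFAULT_MODE };

static void parse_mtmr(void) { puts("MTMR parsed"); }
static void apbt_init(void)  { puts("APB timer init"); }

static void time_init(enum timer_mode mode)
{
        parse_mtmr();           /* now unconditional, as in the patch */
        switch (mode) {
        case APBT_ONLY:
                break;
        case LAPIC_APBT:
                break;
        default:
                return;         /* reliable LAPIC timer: no APB timer needed */
        }
        apbt_init();
}

int main(void)
{
        time_init(DEFAULT_MODE); /* MTMR is still parsed on this path */
        return 0;
}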
Signed-off-by: Jacob Pan Signed-off-by: Alan Cox LKML-Reference: <20101109112800.20591.10802.stgit@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index cfa1af2..e6f4473 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -221,6 +221,7 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); switch (mrst_timer_options) { case MRST_TIMER_APBT_ONLY: break; @@ -236,7 +237,6 @@ void __init mrst_time_init(void) return; } /* we need at least one APB timer */ - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); } -- cgit v0.10.2 From f6cd24777513fcc673d432cc29ef59881d3e4df1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Nov 2010 11:13:48 +0100 Subject: irq: Better struct irqaction layout We currently use kmalloc-96 slab for struct irqaction allocations on 64bit arches. This is unfortunate because of possible false sharing and two cache lines accesses. Move 'name' and 'dir' fields at the end of the structure, and force a suitable alignement. Hot path fields now use one cache line on x86_64. Signed-off-by: Eric Dumazet Reviewed-by: Andi Kleen Cc: Peter Zijlstra LKML-Reference: <1288865628.2659.69.camel@edumazet-laptop> Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 79d0c4f..55e0d42 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -114,15 +114,15 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); struct irqaction { irq_handler_t handler; unsigned long flags; - const char *name; void *dev_id; struct irqaction *next; int irq; - struct proc_dir_entry *dir; irq_handler_t thread_fn; struct task_struct *thread; unsigned long thread_flags; -}; + const char *name; + struct proc_dir_entry *dir; +} ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); -- cgit v0.10.2 From 3cf9b85b474e656a0856b88290c7a289ac5ea247 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 5 Nov 2010 16:12:38 -0700 Subject: locking, lockdep: Convert sprintf_symbol to %pS Signed-off-by: Joe Perches Cc: Peter Zijlstra Cc: Jiri Kosina LKML-Reference: <1288998760-11775-6-git-send-email-joe@perches.com> Signed-off-by: Ingo Molnar diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8..1969d2f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) namelen += 2; for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contention_point[i] == 0) @@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contention_point[i]); snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contention_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contention_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contention_point[i], + ip, (void *)class->contention_point[i]); } for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contending_point[i] == 0) @@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contending_point[i]); 
snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contending_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contending_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contending_point[i], + ip, (void *)class->contending_point[i]); } if (i) { seq_puts(m, "\n"); -- cgit v0.10.2 From 5e4f083f78d03e9f8d2e327daccde16976f9bb00 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 24 Oct 2010 11:50:53 +0800 Subject: hrtimer: Remove stale comment on curr_timer curr_timer doesn't resident in struct hrtimer_cpu_base anymore. Signed-off-by: Yong Zhang LKML-Reference: <1287892253-2587-1-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Thomas Gleixner diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0c1b8..dd9954b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -158,7 +158,6 @@ struct hrtimer_clock_base { * @lock: lock protecting the base and associated clock bases * and timers * @clock_base: array of clock bases for this cpu - * @curr_timer: the timer which is executing a callback right now * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode -- cgit v0.10.2 From 1ea6be212eea5ce1e8fabadacb0c639ad87b2f00 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 1 Nov 2010 22:44:34 +0100 Subject: x86, microcode, AMD: Replace vmalloc+memset with vzalloc We don't have to do memset() ourselves after vmalloc() when we have vzalloc(), so change that in arch/x86/kernel/microcode_amd.c::get_next_ucode(). Signed-off-by: Jesper Juhl Signed-off-by: Borislav Petkov diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1af7c0..383d4f8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -183,16 +183,17 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; } - mc = vmalloc(UCODE_MAX_SIZE); - if (mc) { - memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, - total_size)) { - vfree(mc); - mc = NULL; - } else - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; + mc = vzalloc(UCODE_MAX_SIZE); + if (!mc) + return NULL; + + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { + vfree(mc); + mc = NULL; + } else { + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; } + return mc; } -- cgit v0.10.2 From c7657ac0c3e4d4ab569296911164b7a2b0ff871a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 1 Nov 2010 23:36:53 +0100 Subject: x86, microcode, AMD: Cleanup code a bit get_ucode_data is a memcpy() wrapper which always returns 0. Move it into the header and make it an inline. Remove all code checking its return value and turn it into a void. There should be no functionality change resulting from this patch. 
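[As a reduced illustration of this cleanup, with hypothetical names outside the kernel: a wrapper that can only return 0 conveys nothing to its callers, so it can become a void static inline in a header, and every error branch around it goes away.]

#include <string.h>
#include <stdio.h>

/* before */
static int get_data_old(void *to, const unsigned char *from, size_t n)
{
        memcpy(to, from, n);
        return 0;               /* never fails, yet callers checked it */
}

/* after: what would live in the header as a static inline */
static inline void get_data(void *to, const unsigned char *from, size_t n)
{
        memcpy(to, from, n);
}

int main(void)
{
        unsigned char src[4] = { 1, 2, 3, 4 }, dst[4];

        if (get_data_old(dst, src, sizeof(dst)))  /* dead branch */
                return 1;
        get_data(dst, src, sizeof(dst));          /* no branch to write */
        printf("%d\n", dst[3]);
        return 0;
}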
Signed-off-by: Borislav Petkov diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index ef51b50..2421507 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); + +static inline void get_ucode_data(void *to, const u8 *from, size_t n) +{ + memcpy(to, from, n); +} + #else static inline struct microcode_ops * __init init_amd_microcode(void) { diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 383d4f8..1583133 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu) return 0; } -static int get_ucode_data(void *to, const u8 *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static void * get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) { @@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; - if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) - return NULL; + get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); if (section_hdr[0] != UCODE_UCODE_TYPE) { pr_err("error: invalid type field in container file section header\n"); @@ -187,12 +180,8 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) if (!mc) return NULL; - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { - vfree(mc); - mc = NULL; - } else { - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; - } + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; return mc; } @@ -203,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf) unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; - if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) - return 0; + get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); size = buf_pos[2]; @@ -220,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf) } buf += UCODE_CONTAINER_HEADER_SIZE; - if (get_ucode_data(equiv_cpu_table, buf, size)) { - vfree(equiv_cpu_table); - return 0; - } + get_ucode_data(equiv_cpu_table, buf, size); return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ } -- cgit v0.10.2 From ae791a2d2e382adc69990a144a7f1a6c4bc24f1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Nov 2010 13:30:36 +0100 Subject: futex: Cleanup stale fshared flag interfaces The fast GUP changes stopped using the fshared flag in put_futex_keys(), but we kept the interface the same. Cleanup all stale users. This patch is split out from Darren Harts combo patch which also combines various flags. This way the changes are clearly separated. 
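[A minimal standalone sketch of the interface cleanup, with hypothetical names standing in for put_futex_key() and its callers: once a parameter is never read inside a function, dropping it simplifies every call site.]

#include <stdio.h>

struct key { int refs; };

static void drop_key_refs(struct key *k) { k->refs--; }

/* before: fshared accepted but ignored */
static inline void put_key_old(int fshared, struct key *k)
{
        (void)fshared;
        drop_key_refs(k);
}

/* after */
static inline void put_key(struct key *k)
{
        drop_key_refs(k);
}

int main(void)
{
        struct key k = { .refs = 2 };

        put_key_old(1, &k);     /* callers had to thread fshared through */
        put_key(&k);            /* cleaned-up call site */
        printf("%d\n", k.refs);
        return 0;
}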
Signed-off-by: Thomas Gleixner Cc: Darren Hart LKML-Reference: <1289250609-16304-1-git-send-email-dvhart@linux.intel.com> diff --git a/kernel/futex.c b/kernel/futex.c index 40a8777..38cf606 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -283,8 +283,7 @@ again: return 0; } -static inline -void put_futex_key(int fshared, union futex_key *key) +static inline void put_futex_key(union futex_key *key) { drop_futex_key_refs(key); } @@ -907,7 +906,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; } @@ -965,8 +964,8 @@ retry_private: if (!fshared) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } @@ -996,9 +995,9 @@ retry_private: double_unlock_hb(hb1, hb2); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: return ret; } @@ -1219,8 +1218,8 @@ retry_private: if (!fshared) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -1260,8 +1259,8 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -1269,8 +1268,8 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again. */ double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); cond_resched(); goto retry; default: @@ -1352,9 +1351,9 @@ out_unlock: drop_futex_key_refs(&key1); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: if (pi_state != NULL) free_pi_state(pi_state); @@ -1494,7 +1493,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, int fshared) + struct task_struct *newowner) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1600,7 +1599,6 @@ static long futex_wait_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management * @uaddr: user address of the futex - * @fshared: whether the futex is shared (1) or not (0) * @q: futex_q (contains pi_state and access to the rt_mutex) * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) * @@ -1613,8 +1611,7 @@ static long futex_wait_restart(struct restart_block *restart); * 0 - success, lock not taken * <0 - on error (-EFAULT) */ -static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, - int locked) +static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { struct task_struct *owner; int ret = 0; @@ -1625,7 +1622,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * did a lock-steal - fix up the PI-state in that case: */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current, fshared); + ret = fixup_pi_state_owner(uaddr, q, current); goto out; } @@ -1652,7 +1649,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * lock. 
Fix the state up. */ owner = rt_mutex_owner(&q->pi_state->pi_mutex); - ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -1772,7 +1769,7 @@ retry_private: if (!fshared) goto retry_private; - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); goto retry; } @@ -1783,7 +1780,7 @@ retry_private: out: if (ret) - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); return ret; } @@ -1941,7 +1938,7 @@ retry_private: * exit to complete. */ queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); cond_resched(); goto retry; default: @@ -1971,7 +1968,7 @@ retry_private: * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr, fshared, &q, !ret); + res = fixup_owner(uaddr, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it acquired * the lock, clear our -ETIMEDOUT or -EINTR. @@ -1995,7 +1992,7 @@ out_unlock_put_key: queue_unlock(&q, hb); out_put_key: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out: if (to) destroy_hrtimer_on_stack(&to->timer); @@ -2011,7 +2008,7 @@ uaddr_faulted: if (!fshared) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); goto retry; } @@ -2093,14 +2090,14 @@ retry: out_unlock: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; pi_faulted: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -2273,8 +2270,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); - ret = fixup_pi_state_owner(uaddr2, &q, current, - fshared); + ret = fixup_pi_state_owner(uaddr2, &q, current); spin_unlock(q.lock_ptr); } } else { @@ -2293,7 +2289,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr2, fshared, &q, !ret); + res = fixup_owner(uaddr2, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. @@ -2324,9 +2320,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, } out_put_keys: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out_key2: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out: if (to) { -- cgit v0.10.2 From b41277dc7a18ee332d9e8078e978bacdf6e76157 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 8 Nov 2010 13:10:09 -0800 Subject: futex: Replace fshared and clockrt with combined flags In the early days we passed the mmap sem around. That became the "int fshared" with the fast gup improvements. Then we added "int clockrt" in places. This patch unifies these options as "flags". [ tglx: Split out the stale fshared cleanup ] Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Eric Dumazet Cc: John Kacur Cc: Ingo Molnar LKML-Reference: <1289250609-16304-1-git-send-email-dvhart@linux.intel.com> Signed-off-by: Thomas Gleixner diff --git a/kernel/futex.c b/kernel/futex.c index 38cf606..87ad287 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled; #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* + * Futex flags used to encode options to functions and preserve them across + * restarts. 
+ */ +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +/* * Priority Inheritance state: */ struct futex_pi_state { @@ -869,7 +877,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) /* * Wake up waiters matching bitset queued on this futex (uaddr). */ -static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -880,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -916,7 +925,7 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -926,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -961,7 +970,7 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; put_futex_key(&key2); @@ -1132,13 +1141,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @flags: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) + * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. 
@@ -1147,9 +1156,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * >=0 - on success, the number of tasks requeued or woken * <0 - on error */ -static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval, - int requeue_pi) +static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_requeue, + u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int drop_count = 0, task_count = 0, ret; @@ -1190,10 +1199,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1215,7 +1224,7 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; put_futex_key(&key2); @@ -1586,14 +1595,6 @@ handle_fault: goto retry; } -/* - * In case we must use restart_block to restart a futex_wait, - * we encode in the 'flags' shared capability - */ -#define FLAGS_SHARED 0x01 -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 - static long futex_wait_restart(struct restart_block *restart); /** @@ -1712,7 +1713,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value - * @fshared: whether the futex is shared (1) or not (0) + * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @hb: storage for hash_bucket pointer to be returned to caller * @@ -1725,7 +1726,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * 0 - uaddr contains val and hb has been locked * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; @@ -1750,7 +1751,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, */ retry: q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); if (unlikely(ret != 0)) return ret; @@ -1766,7 +1767,7 @@ retry_private: if (ret) goto out; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; put_futex_key(&q->key); @@ -1784,8 +1785,8 @@ out: return ret; } -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to = NULL; struct restart_block *restart; @@ -1804,8 +1805,9 @@ static int futex_wait(u32 __user *uaddr, int fshared, if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -1816,7 +1818,7 @@ retry: * Prepare to wait on uaddr. On success, holds hb lock and increments * q.key refs. 
*/ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out; @@ -1849,12 +1851,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = FLAGS_HAS_TIMEOUT; - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; + restart->futex.flags = flags; ret = -ERESTART_RESTARTBLOCK; @@ -1870,7 +1867,6 @@ out: static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - int fshared = 0; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { @@ -1878,11 +1874,9 @@ static long futex_wait_restart(struct restart_block *restart) tp = &t; } restart->fn = do_no_restart_syscall; - if (restart->futex.flags & FLAGS_SHARED) - fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, - restart->futex.bitset, - restart->futex.flags & FLAGS_CLOCKRT); + + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); } @@ -1892,8 +1886,8 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int fshared, - int detect, ktime_t *time, int trylock) +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, + ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; @@ -1916,7 +1910,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.requeue_pi_key = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); if (unlikely(ret != 0)) goto out; @@ -2005,7 +1999,7 @@ uaddr_faulted: if (ret) goto out_put_key; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; put_futex_key(&q.key); @@ -2017,7 +2011,7 @@ uaddr_faulted: * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr, int fshared) +static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -2035,7 +2029,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -2157,7 +2151,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) - * @fshared: whether the futexes are shared (1) or not (0). They must be + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be * the same type, no requeueing from private to shared, etc. 
* @val: the expected value of uaddr * @abs_time: absolute timeout @@ -2195,9 +2189,9 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * 0 - On success * <0 - On error */ -static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, +static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, - int clockrt, u32 __user *uaddr2) + u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; @@ -2212,8 +2206,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -2227,7 +2222,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_waiter.task = NULL; key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out; @@ -2240,7 +2235,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Prepare to wait on uaddr. On success, increments q.key (key1) ref * count. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out_key2; @@ -2547,58 +2542,57 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int clockrt, ret = -ENOSYS; - int cmd = op & FUTEX_CMD_MASK; - int fshared = 0; + int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = 1; + flags |= FLAGS_SHARED; - clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) - return -ENOSYS; + if (op & FUTEX_CLOCK_REALTIME) { + flags |= FLAGS_CLOCKRT; + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); + ret = futex_wait(uaddr, flags, val, timeout, val3); break; case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, fshared, val, val3); + ret = futex_wake(uaddr, flags, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + ret = futex_lock_pi(uaddr, flags, val, timeout, 0); break; case FUTEX_UNLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, fshared); + ret = futex_unlock_pi(uaddr, flags); break; case FUTEX_TRYLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); break; case 
FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; - ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, - clockrt, uaddr2); + ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); break; case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 1); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); break; default: ret = -ENOSYS; -- cgit v0.10.2 From 5bdb05f91b27b9361c4f348a4e05999f597df72e Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 8 Nov 2010 13:40:28 -0800 Subject: futex: Add futex_q static initializer The futex_q struct has grown considerably over the last couple years. I believe it now merits a static initializer to avoid uninitialized data errors (having spent more time than I care to admit debugging an uninitialized q.bitset in an experimental new op code). With the key initializer built in, several of the FUTEX_KEY_INIT calls can be removed. V2: use a static variable instead of an init macro. use a C99 initializer and don't rely on variable ordering in the struct. V3: make futex_q_init const Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Eric Dumazet Cc: John Kacur Cc: Ingo Molnar LKML-Reference: <1289252428-18383-1-git-send-email-dvhart@linux.intel.com> Signed-off-by: Thomas Gleixner diff --git a/kernel/futex.c b/kernel/futex.c index 87ad287..3019b92 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -131,6 +131,12 @@ struct futex_q { u32 bitset; }; +static const struct futex_q futex_q_init = { + /* list gets initialized in queue_me()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY +}; + /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task @@ -1750,7 +1756,6 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * rare, but normal. 
*/ retry: - q->key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); if (unlikely(ret != 0)) return ret; @@ -1791,16 +1796,12 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper timeout, *to = NULL; struct restart_block *restart; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; - - q.pi_state = NULL; q.bitset = bitset; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; if (abs_time) { to = &timeout; @@ -1891,7 +1892,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int res, ret; if (refill_pi_state_cache()) @@ -1905,11 +1906,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, hrtimer_set_expires(&to->timer, *time); } - q.pi_state = NULL; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; retry: - q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); if (unlikely(ret != 0)) goto out; @@ -2197,8 +2194,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; - union futex_key key2; - struct futex_q q; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; int res, ret; if (!bitset) @@ -2221,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out; - q.pi_state = NULL; q.bitset = bitset; q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; -- cgit v0.10.2 From c5cbac69422a9bffe7c7fd9a115130e272b547f5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Nov 2010 10:35:51 -0800 Subject: x86, cpu: Rename verify_cpu_64.S to verify_cpu.S The code is 32bit already, and can be used in 32bit routines. Signed-off-by: Kees Cook LKML-Reference: <1289414154-7829-2-git-send-email-kees.cook@canonical.com> Acked-by: Pekka Enberg Acked-by: Alan Cox Signed-off-by: H. Peter Anvin diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 52f85a1..35af09d 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -182,7 +182,7 @@ no_longmode: hlt jmp 1b -#include "../../kernel/verify_cpu_64.S" +#include "../../kernel/verify_cpu.S" /* * Be careful here startup_64 needs to be at a predictable diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 3af2dff..075d130 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -127,7 +127,7 @@ startup_64: no_longmode: hlt jmp no_longmode -#include "verify_cpu_64.S" +#include "verify_cpu.S" # Careful these need to be in the same 64K segment as the above; tidt: diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S new file mode 100644 index 0000000..56a8c2a --- /dev/null +++ b/arch/x86/kernel/verify_cpu.S @@ -0,0 +1,106 @@ +/* + * + * verify_cpu.S - Code for cpu long mode and SSE verification. This + * code has been borrowed from boot/setup.S and was introduced by + * Andi Kleen. 
+ * + * Copyright (c) 2007 Andi Kleen (ak@suse.de) + * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) + * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + * + * This is a common code for verification whether CPU supports + * long mode and SSE or not. It is not called directly instead this + * file is included at various places and compiled in that context. + * Following are the current usage. + * + * This file is included by both 16bit and 32bit code. + * + * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) + * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) + * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) + * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) + * + * verify_cpu, returns the status of cpu check in register %eax. + * 0: Success 1: Failure + * + * The caller needs to check for the error code and take the action + * appropriately. Either display a message or halt. + */ + +#include +#include + +verify_cpu: + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz verify_cpu_no_longmode # cpu has no cpuid + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb verify_cpu_no_longmode # no cpuid 1 + + xor %di,%di + cmpl $0x68747541,%ebx # AuthenticAMD + jnz verify_cpu_noamd + cmpl $0x69746e65,%edx + jnz verify_cpu_noamd + cmpl $0x444d4163,%ecx + jnz verify_cpu_noamd + mov $1,%di # cpu is from AMD + +verify_cpu_noamd: + movl $0x1,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK0,%edx + xorl $REQUIRED_MASK0,%edx + jnz verify_cpu_no_longmode + + movl $0x80000000,%eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001,%eax + jb verify_cpu_no_longmode # no extended cpuid + + movl $0x80000001,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz verify_cpu_no_longmode + +verify_cpu_sse_test: + movl $1,%eax + cpuid + andl $SSE_MASK,%edx + cmpl $SSE_MASK,%edx + je verify_cpu_sse_ok + test %di,%di + jz verify_cpu_no_longmode # only try to force SSE on AMD + movl $MSR_K7_HWCR,%ecx + rdmsr + btr $15,%eax # enable SSE + wrmsr + xor %di,%di # don't loop + jmp verify_cpu_sse_test # try again + +verify_cpu_no_longmode: + popfl # Restore caller passed flags + movl $1,%eax + ret +verify_cpu_sse_ok: + popfl # Restore caller passed flags + xorl %eax, %eax + ret diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S deleted file mode 100644 index 56a8c2a..0000000 --- a/arch/x86/kernel/verify_cpu_64.S +++ /dev/null @@ -1,106 +0,0 @@ -/* - * - * verify_cpu.S - Code for cpu long mode and SSE verification. This - * code has been borrowed from boot/setup.S and was introduced by - * Andi Kleen. - * - * Copyright (c) 2007 Andi Kleen (ak@suse.de) - * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) - * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - * - * This is a common code for verification whether CPU supports - * long mode and SSE or not. 
It is not called directly instead this - * file is included at various places and compiled in that context. - * Following are the current usage. - * - * This file is included by both 16bit and 32bit code. - * - * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) - * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) - * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) - * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) - * - * verify_cpu, returns the status of cpu check in register %eax. - * 0: Success 1: Failure - * - * The caller needs to check for the error code and take the action - * appropriately. Either display a message or halt. - */ - -#include -#include - -verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl - - pushfl # standard way to check for cpuid - popl %eax - movl %eax,%ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - cmpl %eax,%ebx - jz verify_cpu_no_longmode # cpu has no cpuid - - movl $0x0,%eax # See if cpuid 1 is implemented - cpuid - cmpl $0x1,%eax - jb verify_cpu_no_longmode # no cpuid 1 - - xor %di,%di - cmpl $0x68747541,%ebx # AuthenticAMD - jnz verify_cpu_noamd - cmpl $0x69746e65,%edx - jnz verify_cpu_noamd - cmpl $0x444d4163,%ecx - jnz verify_cpu_noamd - mov $1,%di # cpu is from AMD - -verify_cpu_noamd: - movl $0x1,%eax # Does the cpu have what it takes - cpuid - andl $REQUIRED_MASK0,%edx - xorl $REQUIRED_MASK0,%edx - jnz verify_cpu_no_longmode - - movl $0x80000000,%eax # See if extended cpuid is implemented - cpuid - cmpl $0x80000001,%eax - jb verify_cpu_no_longmode # no extended cpuid - - movl $0x80000001,%eax # Does the cpu have what it takes - cpuid - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx - jnz verify_cpu_no_longmode - -verify_cpu_sse_test: - movl $1,%eax - cpuid - andl $SSE_MASK,%edx - cmpl $SSE_MASK,%edx - je verify_cpu_sse_ok - test %di,%di - jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $MSR_K7_HWCR,%ecx - rdmsr - btr $15,%eax # enable SSE - wrmsr - xor %di,%di # don't loop - jmp verify_cpu_sse_test # try again - -verify_cpu_no_longmode: - popfl # Restore caller passed flags - movl $1,%eax - ret -verify_cpu_sse_ok: - popfl # Restore caller passed flags - xorl %eax, %eax - ret -- cgit v0.10.2 From ae84739c27b6b3725993202fe02ff35ab86468e1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Nov 2010 10:35:52 -0800 Subject: x86, cpu: Clear XD_DISABLED flag on Intel to regain NX Intel CPUs have an additional MSR bit to indicate if the BIOS was configured to disable the NX cpu feature. This bit was traditionally used for operating systems that did not understand how to handle the NX bit. Since Linux understands this, this BIOS flag should be ignored by default. In a review[1] of reported hardware being used by Ubuntu bug reporters, almost 10% of systems had an incorrectly configured BIOS, leaving their systems unable to use the NX features of their CPU. This change will clear the MSR_IA32_MISC_ENABLE_XD_DISABLE bit so that NX cannot be inappropriately controlled by the BIOS on Intel CPUs. If, under very strange hardware configurations, NX actually needs to be disabled, "noexec=off" can be used to restore the prior behavior. [1] http://www.outflux.net/blog/archives/2010/02/18/data-mining-for-nx-bit/ Signed-off-by: Kees Cook LKML-Reference: <1289414154-7829-3-git-send-email-kees.cook@canonical.com> Acked-by: Pekka Enberg Acked-by: Alan Cox Signed-off-by: H. 
Peter Anvin diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 56a8c2a..ccb4136 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -7,6 +7,7 @@ * Copyright (c) 2007 Andi Kleen (ak@suse.de) * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. @@ -14,18 +15,16 @@ * This is a common code for verification whether CPU supports * long mode and SSE or not. It is not called directly instead this * file is included at various places and compiled in that context. - * Following are the current usage. + * This file is expected to run in 32bit code. Currently: * - * This file is included by both 16bit and 32bit code. + * arch/x86_64/boot/compressed/head_64.S: Boot cpu verification + * arch/x86_64/kernel/trampoline_64.S: secondary processor verfication * - * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) - * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) - * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) - * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) - * - * verify_cpu, returns the status of cpu check in register %eax. + * verify_cpu, returns the status of longmode and SSE in register %eax. * 0: Success 1: Failure * + * On Intel, the XD_DISABLE flag will be cleared as a side-effect. + * * The caller needs to check for the error code and take the action * appropriately. Either display a message or halt. */ @@ -62,8 +61,41 @@ verify_cpu: cmpl $0x444d4163,%ecx jnz verify_cpu_noamd mov $1,%di # cpu is from AMD + jmp verify_cpu_check verify_cpu_noamd: + cmpl $0x756e6547,%ebx # GenuineIntel? + jnz verify_cpu_check + cmpl $0x49656e69,%edx + jnz verify_cpu_check + cmpl $0x6c65746e,%ecx + jnz verify_cpu_check + + # only call IA32_MISC_ENABLE when: + # family > 6 || (family == 6 && model >= 0xd) + movl $0x1, %eax # check CPU family and model + cpuid + movl %eax, %ecx + + andl $0x0ff00f00, %eax # mask family and extended family + shrl $8, %eax + cmpl $6, %eax + ja verify_cpu_clear_xd # family > 6, ok + jb verify_cpu_check # family < 6, skip + + andl $0x000f00f0, %ecx # mask model and extended model + shrl $4, %ecx + cmpl $0xd, %ecx + jb verify_cpu_check # family == 6, model < 0xd, skip + +verify_cpu_clear_xd: + movl $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE + jnc verify_cpu_check # only write MSR if bit was changed + wrmsr + +verify_cpu_check: movl $0x1,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK0,%edx -- cgit v0.10.2 From ebba638ae723d8a8fc2f7abce5ec18b688b791d7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Nov 2010 10:35:53 -0800 Subject: x86, cpu: Call verify_cpu during 32bit CPU startup The XD_DISABLE-clearing side-effect needs to happen for both 32bit and 64bit, but the 32bit init routines were not calling verify_cpu() yet. This adds that call to gain the side-effect. The longmode/SSE tests being performed in verify_cpu() need to happen very early for 64bit but not for 32bit. Instead of including it in two places for 32bit, we can just include it once in arch/x86/kernel/head_32.S. Signed-off-by: Kees Cook LKML-Reference: <1289414154-7829-4-git-send-email-kees.cook@canonical.com> Acked-by: Pekka Enberg Acked-by: Alan Cox Signed-off-by: H. 
Peter Anvin diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bcece91..fdaea52 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -314,6 +314,10 @@ ENTRY(startup_32_smp) subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax ja 6f + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + mov $0x80000001, %eax cpuid /* Execute Disable bit supported? */ @@ -609,6 +613,8 @@ ignore_int: #endif iret +#include "verify_cpu.S" + __REFDATA .align 4 ENTRY(initial_code) diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index ccb4136..5644b4b 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -19,6 +19,7 @@ * * arch/x86_64/boot/compressed/head_64.S: Boot cpu verification * arch/x86_64/kernel/trampoline_64.S: secondary processor verfication + * arch/x86_64/kernel/head_32.S: processor startup * * verify_cpu, returns the status of longmode and SSE in register %eax. * 0: Success 1: Failure -- cgit v0.10.2 From 6036f373ea03687d355634fa70fb04baa95ab75e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Nov 2010 10:35:54 -0800 Subject: x86, cpu: Only CPU features determine NX capabilities Fix the NX feature boot warning when NX is missing to correctly reflect that BIOSes cannot disable NX now. Signed-off-by: Kees Cook LKML-Reference: <1289414154-7829-5-git-send-email-kees.cook@canonical.com> Acked-by: Pekka Enberg Acked-by: Alan Cox Signed-off-by: H. Peter Anvin diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa..410531d 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -41,7 +41,7 @@ void __init x86_report_nx(void) { if (!cpu_has_nx) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU or disabled in BIOS!\n"); + "missing in CPU!\n"); } else { #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) if (disable_nx) { -- cgit v0.10.2 From cfb505a7ebd4c84206b4cc7d9f966d864a2ac05a Mon Sep 17 00:00:00 2001 From: Alek Du Date: Wed, 10 Nov 2010 16:50:08 +0000 Subject: x86: mrst: Add Moorestown specific reboot/shutdown support Moorestowns needs to use a special IPC command to reboot or shutdown the platform. Signed-off-by: Alek Du Signed-off-by: Alan Cox LKML-Reference: <20101110164928.6365.94243.stgit@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index e6f4473..c727d97 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -35,6 +35,7 @@ #include #include #include +#include /* @@ -268,6 +269,17 @@ static int mrst_i8042_detect(void) return 0; } +/* Reboot and power off are handled by the SCU on a MID device */ +static void mrst_power_off(void) +{ + intel_scu_ipc_simple_command(0xf1, 1); +} + +static void mrst_reboot(void) +{ + intel_scu_ipc_simple_command(0xf1, 0); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. 
@@ -293,6 +305,10 @@ void __init x86_mrst_early_setup(void) legacy_pic = &null_legacy_pic; + /* Moorestown specific power_off/restart method */ + pm_power_off = mrst_power_off; + machine_ops.emergency_restart = mrst_reboot; + /* Avoid searching for BIOS MP tables */ x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; -- cgit v0.10.2 From 7309282c90d251cde77fe3b520a8276e25315c49 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 10 Nov 2010 17:29:00 +0000 Subject: x86: mrst: Add vrtc driver which serves as a wall clock device Moorestown platform doesn't have a m146818 RTC device like traditional x86 PC, but a firmware emulated virtual RTC device(vrtc), which provides some basic RTC functions like get/set time. vrtc serves as the only wall clock device on Moorestown platform. [ tglx: Changed the exports to _GPL ] Signed-off-by: Feng Tang Signed-off-by: Jacob Pan Signed-off-by: Alan Cox LKML-Reference: <20101110172837.3311.40483.stgit@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 4d293dc..139591a 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -117,6 +117,10 @@ enum fixed_addresses { FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, + +#ifdef CONFIG_X86_MRST + FIX_LNW_VRTC, +#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h new file mode 100644 index 0000000..73668ab --- /dev/null +++ b/arch/x86/include/asm/mrst-vrtc.h @@ -0,0 +1,9 @@ +#ifndef _MRST_VRTC_H +#define _MRST_VRTC_H + +extern unsigned char vrtc_cmos_read(unsigned char reg); +extern void vrtc_cmos_write(unsigned char val, unsigned char reg); +extern unsigned long vrtc_get_time(void); +extern int vrtc_set_mmss(unsigned long nowtime); + +#endif diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 283debd..719f00b 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -14,7 +14,9 @@ #include extern int pci_mrst_init(void); -int __init sfi_parse_mrtc(struct sfi_table_header *table); +extern int __init sfi_parse_mrtc(struct sfi_table_header *table); +extern int sfi_mrtc_num; +extern struct sfi_rtc_table_entry sfi_mrtc_array[]; /* * Medfield is the follow-up of Moorestown, it combines two chip solution into @@ -54,4 +56,10 @@ extern void hsu_early_console_init(void); extern void intel_scu_devices_create(void); extern void intel_scu_devices_destroy(void); +/* VRTC timer */ +#define MRST_VRTC_MAP_SZ (1024) +/*#define MRST_VRTC_PGOFFSET (0xc00) */ + +extern void mrst_rtc_init(void); + #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index efbbc55..4d3e256 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_X86_MRST) += mrst.o +obj-$(CONFIG_X86_MRST) += vrtc.o diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index c727d97..42a0351f 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -37,7 +37,6 @@ #include #include - /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, * cmdline option x86_mrst_timer can be used to override the configuration @@ -242,11 +241,6 @@ void __init 
mrst_time_init(void) apbt_time_init(); } -void __init mrst_rtc_init(void) -{ - sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); -} - void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c new file mode 100644 index 0000000..4944bd5 --- /dev/null +++ b/arch/x86/platform/mrst/vrtc.c @@ -0,0 +1,120 @@ +/* + * vrtc.c: Driver for virtual RTC device on Intel MID platform + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * VRTC is emulated by system controller firmware, the real HW + * RTC is located in the PMIC device. SCU FW shadows PMIC RTC + * in a memory mapped IO space that is visible to the host IA + * processor. + * + * This driver is based on RTC CMOS driver. + */ + +#include +#include +#include + +#include +#include +#include +#include + +static unsigned char __iomem *vrtc_virt_base; + +unsigned char vrtc_cmos_read(unsigned char reg) +{ + unsigned char retval; + + /* vRTC's registers range from 0x0 to 0xD */ + if (reg > 0xd || !vrtc_virt_base) + return 0xff; + + lock_cmos_prefix(reg); + retval = __raw_readb(vrtc_virt_base + (reg << 2)); + lock_cmos_suffix(reg); + return retval; +} +EXPORT_SYMBOL_GPL(vrtc_cmos_read); + +void vrtc_cmos_write(unsigned char val, unsigned char reg) +{ + if (reg > 0xd || !vrtc_virt_base) + return; + + lock_cmos_prefix(reg); + __raw_writeb(val, vrtc_virt_base + (reg << 2)); + lock_cmos_suffix(reg); +} +EXPORT_SYMBOL_GPL(vrtc_cmos_write); + +unsigned long vrtc_get_time(void) +{ + u8 sec, min, hour, mday, mon; + u32 year; + + while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP)) + cpu_relax(); + + sec = vrtc_cmos_read(RTC_SECONDS); + min = vrtc_cmos_read(RTC_MINUTES); + hour = vrtc_cmos_read(RTC_HOURS); + mday = vrtc_cmos_read(RTC_DAY_OF_MONTH); + mon = vrtc_cmos_read(RTC_MONTH); + year = vrtc_cmos_read(RTC_YEAR); + + /* vRTC YEAR reg contains the offset to 1960 */ + year += 1960; + + printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " + "mon: %d year: %d\n", sec, min, hour, mday, mon, year); + + return mktime(year, mon, mday, hour, min, sec); +} + +/* Only care about the minutes and seconds */ +int vrtc_set_mmss(unsigned long nowtime) +{ + int real_sec, real_min; + int vrtc_min; + + vrtc_min = vrtc_cmos_read(RTC_MINUTES); + + real_sec = nowtime % 60; + real_min = nowtime / 60; + if (((abs(real_min - vrtc_min) + 15)/30) & 1) + real_min += 30; + real_min %= 60; + + vrtc_cmos_write(real_sec, RTC_SECONDS); + vrtc_cmos_write(real_min, RTC_MINUTES); + return 0; +} + +void __init mrst_rtc_init(void) +{ + unsigned long rtc_paddr; + void __iomem *virt_base; + + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); + if (!sfi_mrtc_num) + return; + + rtc_paddr = sfi_mrtc_array[0].phys_addr; + + /* vRTC's register address may not be page aligned */ + set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); + + virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC); + virt_base += rtc_paddr & ~PAGE_MASK; + vrtc_virt_base = virt_base; + + x86_platform.get_wallclock = vrtc_get_time; + x86_platform.set_wallclock = vrtc_set_mmss; +} -- cgit v0.10.2 From 0146f26145af75d53e12dbf23a36996aff373680 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 10 Nov 2010 17:29:17 +0000 Subject: rtc: Add drivers/rtc/rtc-mrst.c Provide the 
standard kernel rtc driver interface on top of the vrtc layer added in the previous patch. Signed-off-by: Feng Tang LKML-Reference: <20101110172911.3311.20593.stgit@localhost.localdomain> [Fixed swapped arguments on IPC] Signed-off-by: Arjan van de Ven [Cleaned up and the device creation moved to arch/x86/platform] Signed-off-by: Alan Cox Signed-off-by: Thomas Gleixner diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 4944bd5..4d3f770 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -118,3 +119,48 @@ void __init mrst_rtc_init(void) x86_platform.get_wallclock = vrtc_get_time; x86_platform.set_wallclock = vrtc_set_mmss; } + +/* + * The Moorestown platform has a memory mapped virtual RTC device that emulates + * the programming interface of the RTC. + */ + +static struct resource vrtc_resources[] = { + [0] = { + .flags = IORESOURCE_MEM, + }, + [1] = { + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device vrtc_device = { + .name = "rtc_mrst", + .id = -1, + .resource = vrtc_resources, + .num_resources = ARRAY_SIZE(vrtc_resources), +}; + +/* Register the RTC device if appropriate */ +static int __init mrst_device_create(void) +{ + /* No Moorestown, no device */ + if (!mrst_identify_cpu()) + return -ENODEV; + /* No timer, no device */ + if (!sfi_mrtc_num) + return -ENODEV; + + /* iomem resource */ + vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr; + vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr + + MRST_VRTC_MAP_SZ; + /* irq resource */ + vrtc_resources[1].start = sfi_mrtc_array[0].irq; + vrtc_resources[1].end = sfi_mrtc_array[0].irq; + + platform_device_register(&vrtc_device); + return 0; +} + +module_init(mrst_device_create); diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 2883428..4941cad 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -463,6 +463,18 @@ config RTC_DRV_CMOS This driver can also be built as a module. If so, the module will be called rtc-cmos. +config RTC_DRV_VRTC + tristate "Virtual RTC for Moorestown platforms" + depends on X86_MRST + default y if X86_MRST + + help + Say "yes" here to get direct support for the real time clock + found on Moorestown platforms. The VRTC is a emulated RTC that + derives its clock source from a real RTC in the PMIC. The MC146818 + style programming interface is mostly conserved, but any + updates are done via IPC calls to the system controller FW. 
+ config RTC_DRV_DS1216 tristate "Dallas DS1216" depends on SNI_RM diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 4c2832d..2afdaf3 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_RTC_DRV_CMOS) += rtc-cmos.o obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o +obj-$(CONFIG_RTC_DRV_VRTC) += rtc-mrst.o obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c new file mode 100644 index 0000000..67b6be2 --- /dev/null +++ b/drivers/rtc/rtc-mrst.c @@ -0,0 +1,578 @@ +/* + * rtc-mrst.c: Driver for Moorestown virtual RTC + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * Feng Tang (feng.tang@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * VRTC is emulated by system controller firmware, the real HW + * RTC is located in the PMIC device. SCU FW shadows PMIC RTC + * in a memory mapped IO space that is visible to the host IA + * processor. + * + * This driver is based upon drivers/rtc/rtc-cmos.c + */ + +/* + * Note: + * * vRTC only supports binary mode and 24H mode + * * vRTC only support PIE and AIE, no UIE, and its PIE only happens + * at 23:59:59pm everyday, no support for adjustable frequency + * * Alarm function is also limited to hr/min/sec. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct mrst_rtc { + struct rtc_device *rtc; + struct device *dev; + int irq; + struct resource *iomem; + + u8 enabled_wake; + u8 suspend_ctrl; +}; + +static const char driver_name[] = "rtc_mrst"; + +#define RTC_IRQMASK (RTC_PF | RTC_AF) + +static inline int is_intr(u8 rtc_intr) +{ + if (!(rtc_intr & RTC_IRQF)) + return 0; + return rtc_intr & RTC_IRQMASK; +} + +/* + * rtc_time's year contains the increment over 1900, but vRTC's YEAR + * register can't be programmed to value larger than 0x64, so vRTC + * driver chose to use 1960 (1970 is UNIX time start point) as the base, + * and does the translation at read/write time + */ +static int mrst_read_time(struct device *dev, struct rtc_time *time) +{ + unsigned long flags; + + if (rtc_is_updating()) + mdelay(20); + + spin_lock_irqsave(&rtc_lock, flags); + time->tm_sec = vrtc_cmos_read(RTC_SECONDS); + time->tm_min = vrtc_cmos_read(RTC_MINUTES); + time->tm_hour = vrtc_cmos_read(RTC_HOURS); + time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH); + time->tm_mon = vrtc_cmos_read(RTC_MONTH); + time->tm_year = vrtc_cmos_read(RTC_YEAR); + spin_unlock_irqrestore(&rtc_lock, flags); + + /* Adjust for the 1960/1900 */ + time->tm_year += 60; + time->tm_mon--; + return RTC_24H; +} + +static int mrst_set_time(struct device *dev, struct rtc_time *time) +{ + int ret; + unsigned long flags; + unsigned char mon, day, hrs, min, sec; + unsigned int yrs; + + yrs = time->tm_year; + mon = time->tm_mon + 1; /* tm_mon starts at zero */ + day = time->tm_mday; + hrs = time->tm_hour; + min = time->tm_min; + sec = time->tm_sec; + + if (yrs < 70 || yrs > 138) + return -EINVAL; + yrs -= 60; + + spin_lock_irqsave(&rtc_lock, flags); + + vrtc_cmos_write(yrs, RTC_YEAR); + vrtc_cmos_write(mon, 
RTC_MONTH); + vrtc_cmos_write(day, RTC_DAY_OF_MONTH); + vrtc_cmos_write(hrs, RTC_HOURS); + vrtc_cmos_write(min, RTC_MINUTES); + vrtc_cmos_write(sec, RTC_SECONDS); + + spin_unlock_irqrestore(&rtc_lock, flags); + + ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME); + return ret; +} + +static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char rtc_control; + + if (mrst->irq <= 0) + return -EIO; + + /* Basic alarms only support hour, minute, and seconds fields. + * Some also support day and month, for alarms up to a year in + * the future. + */ + t->time.tm_mday = -1; + t->time.tm_mon = -1; + t->time.tm_year = -1; + + /* vRTC only supports binary mode */ + spin_lock_irq(&rtc_lock); + t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM); + t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM); + t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM); + + rtc_control = vrtc_cmos_read(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + t->enabled = !!(rtc_control & RTC_AIE); + t->pending = 0; + + return 0; +} + +static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control) +{ + unsigned char rtc_intr; + + /* + * NOTE after changing RTC_xIE bits we always read INTR_FLAGS; + * allegedly some older rtcs need that to handle irqs properly + */ + rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS); + rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF; + if (is_intr(rtc_intr)) + rtc_update_irq(mrst->rtc, 1, rtc_intr); +} + +static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask) +{ + unsigned char rtc_control; + + /* + * Flush any pending IRQ status, notably for update irqs, + * before we enable new IRQs + */ + rtc_control = vrtc_cmos_read(RTC_CONTROL); + mrst_checkintr(mrst, rtc_control); + + rtc_control |= mask; + vrtc_cmos_write(rtc_control, RTC_CONTROL); + + mrst_checkintr(mrst, rtc_control); +} + +static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask) +{ + unsigned char rtc_control; + + rtc_control = vrtc_cmos_read(RTC_CONTROL); + rtc_control &= ~mask; + vrtc_cmos_write(rtc_control, RTC_CONTROL); + mrst_checkintr(mrst, rtc_control); +} + +static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char hrs, min, sec; + int ret = 0; + + if (!mrst->irq) + return -EIO; + + hrs = t->time.tm_hour; + min = t->time.tm_min; + sec = t->time.tm_sec; + + spin_lock_irq(&rtc_lock); + /* Next rtc irq must not be from previous alarm setting */ + mrst_irq_disable(mrst, RTC_AIE); + + /* Update alarm */ + vrtc_cmos_write(hrs, RTC_HOURS_ALARM); + vrtc_cmos_write(min, RTC_MINUTES_ALARM); + vrtc_cmos_write(sec, RTC_SECONDS_ALARM); + + spin_unlock_irq(&rtc_lock); + + ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM); + if (ret) + return ret; + + spin_lock_irq(&rtc_lock); + if (t->enabled) + mrst_irq_enable(mrst, RTC_AIE); + + spin_unlock_irq(&rtc_lock); + + return 0; +} + +static int mrst_irq_set_state(struct device *dev, int enabled) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned long flags; + + if (!mrst->irq) + return -ENXIO; + + spin_lock_irqsave(&rtc_lock, flags); + + if (enabled) + mrst_irq_enable(mrst, RTC_PIE); + else + mrst_irq_disable(mrst, RTC_PIE); + + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; +} + +#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE) + +/* Currently, the vRTC doesn't support UIE ON/OFF */ +static int +mrst_rtc_ioctl(struct device *dev, 
unsigned int cmd, unsigned long arg) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned long flags; + + switch (cmd) { + case RTC_AIE_OFF: + case RTC_AIE_ON: + if (!mrst->irq) + return -EINVAL; + break; + default: + /* PIE ON/OFF is handled by mrst_irq_set_state() */ + return -ENOIOCTLCMD; + } + + spin_lock_irqsave(&rtc_lock, flags); + switch (cmd) { + case RTC_AIE_OFF: /* alarm off */ + mrst_irq_disable(mrst, RTC_AIE); + break; + case RTC_AIE_ON: /* alarm on */ + mrst_irq_enable(mrst, RTC_AIE); + break; + } + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; +} + +#else +#define mrst_rtc_ioctl NULL +#endif + +#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE) + +static int mrst_procfs(struct device *dev, struct seq_file *seq) +{ + unsigned char rtc_control, valid; + + spin_lock_irq(&rtc_lock); + rtc_control = vrtc_cmos_read(RTC_CONTROL); + valid = vrtc_cmos_read(RTC_VALID); + spin_unlock_irq(&rtc_lock); + + return seq_printf(seq, + "periodic_IRQ\t: %s\n" + "alarm\t\t: %s\n" + "BCD\t\t: no\n" + "periodic_freq\t: daily (not adjustable)\n", + (rtc_control & RTC_PIE) ? "on" : "off", + (rtc_control & RTC_AIE) ? "on" : "off"); +} + +#else +#define mrst_procfs NULL +#endif + +static const struct rtc_class_ops mrst_rtc_ops = { + .ioctl = mrst_rtc_ioctl, + .read_time = mrst_read_time, + .set_time = mrst_set_time, + .read_alarm = mrst_read_alarm, + .set_alarm = mrst_set_alarm, + .proc = mrst_procfs, + .irq_set_state = mrst_irq_set_state, +}; + +static struct mrst_rtc mrst_rtc; + +/* + * When vRTC IRQ is captured by SCU FW, FW will clear the AIE bit in + * Reg B, so no need for this driver to clear it + */ +static irqreturn_t mrst_rtc_irq(int irq, void *p) +{ + u8 irqstat; + + spin_lock(&rtc_lock); + /* This read will clear all IRQ flags inside Reg C */ + irqstat = vrtc_cmos_read(RTC_INTR_FLAGS); + spin_unlock(&rtc_lock); + + irqstat &= RTC_IRQMASK | RTC_IRQF; + if (is_intr(irqstat)) { + rtc_update_irq(p, 1, irqstat); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static int __init +vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq) +{ + int retval = 0; + unsigned char rtc_control; + + /* There can be only one ... 
*/ + if (mrst_rtc.dev) + return -EBUSY; + + if (!iomem) + return -ENODEV; + + iomem = request_mem_region(iomem->start, + iomem->end + 1 - iomem->start, + driver_name); + if (!iomem) { + dev_dbg(dev, "i/o mem already in use.\n"); + return -EBUSY; + } + + mrst_rtc.irq = rtc_irq; + mrst_rtc.iomem = iomem; + + mrst_rtc.rtc = rtc_device_register(driver_name, dev, + &mrst_rtc_ops, THIS_MODULE); + if (IS_ERR(mrst_rtc.rtc)) { + retval = PTR_ERR(mrst_rtc.rtc); + goto cleanup0; + } + + mrst_rtc.dev = dev; + dev_set_drvdata(dev, &mrst_rtc); + rename_region(iomem, dev_name(&mrst_rtc.rtc->dev)); + + spin_lock_irq(&rtc_lock); + mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE); + rtc_control = vrtc_cmos_read(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY))) + dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n"); + + if (rtc_irq) { + retval = request_irq(rtc_irq, mrst_rtc_irq, + IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev), + mrst_rtc.rtc); + if (retval < 0) { + dev_dbg(dev, "IRQ %d is already in use, err %d\n", + rtc_irq, retval); + goto cleanup1; + } + } + dev_dbg(dev, "initialised\n"); + return 0; + +cleanup1: + mrst_rtc.dev = NULL; + rtc_device_unregister(mrst_rtc.rtc); +cleanup0: + release_region(iomem->start, iomem->end + 1 - iomem->start); + dev_err(dev, "rtc-mrst: unable to initialise\n"); + return retval; +} + +static void rtc_mrst_do_shutdown(void) +{ + spin_lock_irq(&rtc_lock); + mrst_irq_disable(&mrst_rtc, RTC_IRQMASK); + spin_unlock_irq(&rtc_lock); +} + +static void __exit rtc_mrst_do_remove(struct device *dev) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + struct resource *iomem; + + rtc_mrst_do_shutdown(); + + if (mrst->irq) + free_irq(mrst->irq, mrst->rtc); + + rtc_device_unregister(mrst->rtc); + mrst->rtc = NULL; + + iomem = mrst->iomem; + release_region(iomem->start, iomem->end + 1 - iomem->start); + mrst->iomem = NULL; + + mrst->dev = NULL; + dev_set_drvdata(dev, NULL); +} + +#ifdef CONFIG_PM +static int mrst_suspend(struct device *dev, pm_message_t mesg) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char tmp; + + /* Only the alarm might be a wakeup event source */ + spin_lock_irq(&rtc_lock); + mrst->suspend_ctrl = tmp = vrtc_cmos_read(RTC_CONTROL); + if (tmp & (RTC_PIE | RTC_AIE)) { + unsigned char mask; + + if (device_may_wakeup(dev)) + mask = RTC_IRQMASK & ~RTC_AIE; + else + mask = RTC_IRQMASK; + tmp &= ~mask; + vrtc_cmos_write(tmp, RTC_CONTROL); + + mrst_checkintr(mrst, tmp); + } + spin_unlock_irq(&rtc_lock); + + if (tmp & RTC_AIE) { + mrst->enabled_wake = 1; + enable_irq_wake(mrst->irq); + } + + dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n", + (tmp & RTC_AIE) ? 
", alarm may wake" : "", + tmp); + + return 0; +} + +/* + * We want RTC alarms to wake us from the deep power saving state + */ +static inline int mrst_poweroff(struct device *dev) +{ + return mrst_suspend(dev, PMSG_HIBERNATE); +} + +static int mrst_resume(struct device *dev) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char tmp = mrst->suspend_ctrl; + + /* Re-enable any irqs previously active */ + if (tmp & RTC_IRQMASK) { + unsigned char mask; + + if (mrst->enabled_wake) { + disable_irq_wake(mrst->irq); + mrst->enabled_wake = 0; + } + + spin_lock_irq(&rtc_lock); + do { + vrtc_cmos_write(tmp, RTC_CONTROL); + + mask = vrtc_cmos_read(RTC_INTR_FLAGS); + mask &= (tmp & RTC_IRQMASK) | RTC_IRQF; + if (!is_intr(mask)) + break; + + rtc_update_irq(mrst->rtc, 1, mask); + tmp &= ~RTC_AIE; + } while (mask & RTC_AIE); + spin_unlock_irq(&rtc_lock); + } + + dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp); + + return 0; +} + +#else +#define mrst_suspend NULL +#define mrst_resume NULL + +static inline int mrst_poweroff(struct device *dev) +{ + return -ENOSYS; +} + +#endif + +static int __init vrtc_mrst_platform_probe(struct platform_device *pdev) +{ + return vrtc_mrst_do_probe(&pdev->dev, + platform_get_resource(pdev, IORESOURCE_MEM, 0), + platform_get_irq(pdev, 0)); +} + +static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev) +{ + rtc_mrst_do_remove(&pdev->dev); + return 0; +} + +static void vrtc_mrst_platform_shutdown(struct platform_device *pdev) +{ + if (system_state == SYSTEM_POWER_OFF && !mrst_poweroff(&pdev->dev)) + return; + + rtc_mrst_do_shutdown(); +} + +MODULE_ALIAS("platform:vrtc_mrst"); + +static struct platform_driver vrtc_mrst_platform_driver = { + .probe = vrtc_mrst_platform_probe, + .remove = __exit_p(vrtc_mrst_platform_remove), + .shutdown = vrtc_mrst_platform_shutdown, + .driver = { + .name = (char *) driver_name, + .suspend = mrst_suspend, + .resume = mrst_resume, + } +}; + +static int __init vrtc_mrst_init(void) +{ + return platform_driver_register(&vrtc_mrst_platform_driver); +} + +static void __exit vrtc_mrst_exit(void) +{ + platform_driver_unregister(&vrtc_mrst_platform_driver); +} + +module_init(vrtc_mrst_init); +module_exit(vrtc_mrst_exit); + +MODULE_AUTHOR("Jacob Pan; Feng Tang"); +MODULE_DESCRIPTION("Driver for Moorestown virtual RTC"); +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 86071535f845fd054753122e564cee9406c84e70 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Wed, 10 Nov 2010 17:40:48 +0000 Subject: x86: mrst: Add audio driver bindings This patch adds the sound card bindings for Moorestown (pmic_audio) and the Medfield platform (msic_audio) as IPC devices. This ensures they will be created at the right time. 
Signed-off-by: Vinod Koul Signed-off-by: Alan Cox LKML-Reference: <20101110174044.11340.78008.stgit@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 42a0351f..237e28f 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -499,6 +499,11 @@ static void __init *lis331dl_platform_data(void *info) return &intr2nd_pdata; } +static void __init *no_platform_data(void *info) +{ + return NULL; +} + static const struct devs_id __initconst device_ids[] = { {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, @@ -506,6 +511,8 @@ static const struct devs_id __initconst device_ids[] = { {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, + {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, + {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, {}, }; -- cgit v0.10.2 From 6f207e9bb4219d261d9326597ca533f954f31755 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 11 Nov 2010 15:50:50 +0000 Subject: x86: mrst: Set vRTC's IRQ to level trigger type When setting up the mpc_intsrc structure for vRTC's IRQ, we need to set its irqflag to level trigger, otherwise it will be taken as edge triggered and the vRTC IRQ will fire only once, as there is never a EOI issued from the IA core for it. The original code worked in previous kernel. This is because it was configured to level trigger type by luck. It fell into the default PCI trigger category which is level triggered. Signed-off-by: Feng Tang Signed-off-by: Alan Cox LKML-Reference: <20101111155019.12924.569.stgit@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 237e28f..fee0b49 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -195,7 +195,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) totallen, (u32)pentry->phys_addr, pentry->irq); mp_irq.type = MP_IOAPIC; mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0; + mp_irq.irqflag = 0xf; /* level trigger and active low */ mp_irq.srcbus = 0; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; -- cgit v0.10.2 From c751e17b5371ad86cdde6cf5c0175e06f3ff0347 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Nov 2010 12:08:04 -0800 Subject: x86: Add CE4100 platform support Add CE4100 platform support. CE4100 needs early setup like moorestown. Signed-off-by: Thomas Gleixner Signed-off-by: Dirk Brandewie LKML-Reference: <94720fd7f5564a12ebf202cf2c4f4c0d619aab35.1289331834.git.dirk.brandewie@gmail.com> Signed-off-by: Thomas Gleixner diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 30b43e1..bdeb81c 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -600,6 +600,7 @@ Protocol: 2.07+ 0x00000001 lguest 0x00000002 Xen 0x00000003 Moorestown MID + 0x00000004 CE4100 TV Platform Field name: hardware_subarch_data Type: write (subarch-dependent) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b306b84..43951c9c7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -377,6 +377,17 @@ config X86_ELAN If unsure, choose "PC-compatible" instead. 
+config X86_INTEL_CE + bool "CE4100 TV platform" + depends on PCI + depends on PCI_GODIRECT + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + ---help--- + Select for the Intel CE media processor (CE4100) SOC. + This option compiles in support for the CE4100 SOC for settop + boxes and media devices. + config X86_MRST bool "Moorestown MID platform" depends on PCI diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 8e62185..c8bfe63 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -124,6 +124,7 @@ enum { X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, X86_SUBARCH_MRST, + X86_SUBARCH_CE4100, X86_NR_SUBARCHS, }; diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index d6763b139a..db8aa19 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void); static inline void x86_mrst_early_setup(void) { } #endif +#ifdef CONFIG_X86_INTEL_CE +extern void x86_ce4100_early_setup(void); +#else +static inline void x86_ce4100_early_setup(void) { } +#endif + #ifndef _SETUP /* diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 7633101..7f138b3 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -61,6 +61,9 @@ void __init i386_start_kernel(void) case X86_SUBARCH_MRST: x86_mrst_early_setup(); break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; default: i386_default_early_setup(); break; diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 7bf70b8..a019bc3 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,4 +1,5 @@ # Platform specific code goes here +obj-y += ce4100/ obj-y += efi/ obj-y += mrst/ obj-y += olpc/ diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile new file mode 100644 index 0000000..91fc929 --- /dev/null +++ b/arch/x86/platform/ce4100/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c new file mode 100644 index 0000000..0ede12b --- /dev/null +++ b/arch/x86/platform/ce4100/ce4100.c @@ -0,0 +1,42 @@ +/* + * Intel CE4100 platform specific setup code + * + * (C) Copyright 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include +#include +#include +#include + +#include + +static int ce4100_i8042_detect(void) +{ + return 0; +} + +static void __init sdv_arch_setup(void) +{ +} + +static void __init sdv_find_smp_config(void) +{ +} + +/* + * CE4100 specific x86_init function overrides and early setup + * calls. + */ +void __init x86_ce4100_early_setup(void) +{ + x86_init.oem.arch_setup = sdv_arch_setup; + x86_platform.i8042_detect = ce4100_i8042_detect; + x86_init.resources.probe_roms = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_smp_config = sdv_find_smp_config; +} -- cgit v0.10.2 From 91d8037f563e4a86ff8b02c994530989c7936427 Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Tue, 9 Nov 2010 12:08:05 -0800 Subject: ce4100: Add PCI register emulation for CE4100 This patch provides access methods for PCI registers that mis-behave on the CE4100. Each register can be assigned a private init, read and write routine. The exception to this is the bridge device. 
The bridge device is the only device on bus zero (0) that requires any fixup so it is a special case. [ tglx: minor coding style cleanups, __init annotation and simplification of ce4100_conf_read/write ] Signed-off-by: Dirk Brandewie LKML-Reference: <40b6751381c2275dc359db5a17989cce22ad8db7.1289331834.git.dirk.brandewie@gmail.com> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index effd96e..6b8759f 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o obj-$(CONFIG_PCI_XEN) += xen.o obj-y += fixup.o +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c new file mode 100644 index 0000000..85b68ef --- /dev/null +++ b/arch/x86/pci/ce4100.c @@ -0,0 +1,315 @@ +/* + * GPL LICENSE SUMMARY + * + * Copyright(c) 2010 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * The full GNU General Public License is included in this distribution + * in the file called LICENSE.GPL. + * + * Contact Information: + * Intel Corporation + * 2200 Mission College Blvd. + * Santa Clara, CA 97052 + * + * This provides access methods for PCI registers that mis-behave on + * the CE4100. Each register can be assigned a private init, read and + * write routine. The exception to this is the bridge device. 
The + * bridge device is the only device on bus zero (0) that requires any + * fixup so it is a special case ATM + */ + +#include +#include +#include + +#include + +struct sim_reg { + u32 value; + u32 mask; +}; + +struct sim_dev_reg { + int dev_func; + int reg; + void (*init)(struct sim_dev_reg *reg); + void (*read)(struct sim_dev_reg *reg, u32 *value); + void (*write)(struct sim_dev_reg *reg, u32 value); + struct sim_reg sim_reg; +}; + +struct sim_reg_op { + void (*init)(struct sim_dev_reg *reg); + void (*read)(struct sim_dev_reg *reg, u32 value); + void (*write)(struct sim_dev_reg *reg, u32 value); +}; + +#define MB (1024 * 1024) +#define KB (1024) +#define SIZE_TO_MASK(size) (~(size - 1)) + +#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\ +{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\ + {0, SIZE_TO_MASK(size)} }, + +static void reg_init(struct sim_dev_reg *reg) +{ + pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4, + ®->sim_reg.value); +} + +static void reg_read(struct sim_dev_reg *reg, u32 *value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + *value = reg->sim_reg.value; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + +static void reg_write(struct sim_dev_reg *reg, u32 value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + reg->sim_reg.value = (value & reg->sim_reg.mask) | + (reg->sim_reg.value & ~reg->sim_reg.mask); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + +static void sata_reg_init(struct sim_dev_reg *reg) +{ + pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4, + ®->sim_reg.value); + reg->sim_reg.value += 0x400; +} + +static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value) +{ + reg_read(reg, value); + if (*value != reg->sim_reg.mask) + *value |= 0x100; +} + +void sata_revid_init(struct sim_dev_reg *reg) +{ + reg->sim_reg.value = 0x01060100; + reg->sim_reg.mask = 0; +} + +static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) +{ + reg_read(reg, value); +} + +static struct sim_dev_reg bus1_fixups[] = { + DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write) + DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write) + DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 
3, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write) + DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write) + DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write) + DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0) + DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write) + DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write) +}; + +static void __init init_sim_regs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].init) + bus1_fixups[i].init(&bus1_fixups[i]); + } +} + +static inline void extract_bytes(u32 *value, int reg, int len) +{ + uint32_t mask; + + *value >>= ((reg & 3) * 8); + mask = 0xFFFFFFFF >> ((4 - len) * 8); + *value &= mask; +} + +int bridge_read(unsigned int devfn, int reg, int len, u32 *value) +{ + u32 av_bridge_base, av_bridge_limit; + int retval = 0; + + switch (reg) { + /* Make BARs appear to not request any memory. */ + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_0 + 1: + case PCI_BASE_ADDRESS_0 + 2: + case PCI_BASE_ADDRESS_0 + 3: + *value = 0; + break; + + /* Since subordinate bus number register is hardwired + * to zero and read only, so do the simulation. + */ + case PCI_PRIMARY_BUS: + if (len == 4) + *value = 0x00010100; + break; + + case PCI_SUBORDINATE_BUS: + *value = 1; + break; + + case PCI_MEMORY_BASE: + case PCI_MEMORY_LIMIT: + /* Get the A/V bridge base address. */ + pci_direct_conf1.read(0, 0, devfn, + PCI_BASE_ADDRESS_0, 4, &av_bridge_base); + + av_bridge_limit = av_bridge_base + (512*MB - 1); + av_bridge_limit >>= 16; + av_bridge_limit &= 0xFFF0; + + av_bridge_base >>= 16; + av_bridge_base &= 0xFFF0; + + if (reg == PCI_MEMORY_LIMIT) + *value = av_bridge_limit; + else if (len == 2) + *value = av_bridge_base; + else + *value = (av_bridge_limit << 16) | av_bridge_base; + break; + /* Make prefetchable memory limit smaller than prefetchable + * memory base, so not claim prefetchable memory space. + */ + case PCI_PREF_MEMORY_BASE: + *value = 0xFFF0; + break; + case PCI_PREF_MEMORY_LIMIT: + *value = 0x0; + break; + /* Make IO limit smaller than IO base, so not claim IO space. 
*/ + case PCI_IO_BASE: + *value = 0xF0; + break; + case PCI_IO_LIMIT: + *value = 0; + break; + default: + retval = 1; + } + return retval; +} + +static int ce4100_conf_read(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + int i, retval = 1; + + if (bus == 1) { + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].read) { + bus1_fixups[i].read(&(bus1_fixups[i]), + value); + extract_bytes(value, reg, len); + return 0; + } + } + } + + if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) && + !bridge_read(devfn, reg, len, value)) + return 0; + + return pci_direct_conf1.read(seg, bus, devfn, reg, len, value); +} + +static int ce4100_conf_write(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + int i; + + if (bus == 1) { + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].write) { + bus1_fixups[i].write(&(bus1_fixups[i]), + value); + return 0; + } + } + } + + /* Discard writes to A/V bridge BAR. */ + if (bus == 0 && PCI_DEVFN(1, 0) == devfn && + ((reg & ~3) == PCI_BASE_ADDRESS_0)) + return 0; + + return pci_direct_conf1.write(seg, bus, devfn, reg, len, value); +} + +struct pci_raw_ops ce4100_pci_conf = { + .read = ce4100_conf_read, + .write = ce4100_conf_write, +}; + +static int __init ce4100_pci_init(void) +{ + init_sim_regs(); + raw_pci_ops = &ce4100_pci_conf; + return 0; +} +subsys_initcall(ce4100_pci_init); -- cgit v0.10.2 From 37bc9f5078c62bfa73edeb0053edceb3ed5e46a4 Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Tue, 9 Nov 2010 12:08:08 -0800 Subject: x86: Ce4100: Add reboot_fixup() for CE4100 This patch adds the CE4100 reboot fixup to reboot_fixups_32.c [ tglx: Moved PCI id to reboot_fixups_32.c ] Signed-off-by: Dirk Brandewie LKML-Reference: <5bdcfb4f0206fa721570504e95659a03b815bc5e.1289331834.git.dirk.brandewie@gmail.com> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 43951c9c7..e5510cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -383,6 +383,7 @@ config X86_INTEL_CE depends on PCI_GODIRECT depends on X86_32 depends on X86_EXTENDED_PLATFORM + select X86_REBOOTFIXUPS ---help--- Select for the Intel CE media processor (CE4100) SOC. 
This option compiles in support for the CE4100 SOC for settop diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index fda313e..c8e41e9 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev) outb(1, 0x92); } +static void ce4100_reset(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < 10; i++) { + outb(0x2, 0xcf9); + udelay(50); + } +} + struct device_fixup { unsigned int vendor; unsigned int device; void (*reboot_fixup)(struct pci_dev *); }; +/* + * PCI ids solely used for fixups_table go here + */ +#define PCI_DEVICE_ID_INTEL_CE4100 0x0708 + static const struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, { PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, +{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset }, }; /* -- cgit v0.10.2 From b9fc71f47dc060c588e5099638242fad44eeecbc Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 15 Nov 2010 17:31:19 +0000 Subject: x86, mrst: The shutdown for MRST requires the SCU IPC mechanism Fix the build failure reported by Randy. Reported-by: Randy Dunlap Signed-off-by: Alan Cox LKML-Reference: <20101115173110.6877.83958.stgit@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e5510cf..6e877b9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -399,6 +399,7 @@ config X86_MRST select APB_TIMER select I2C select SPI + select INTEL_SCU_IPC ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: -- cgit v0.10.2 From ad02519a0d27da4a0a50cbc696e810c94e27c28e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Nov 2010 10:14:06 -0800 Subject: x86, mrst: Fix dependencies of "select INTEL_SCU_IPC" commit b9fc71f47 (x86, mrst: The shutdown for MRST requires the SCU IPC mechanism) introduced the following warning: warning: (X86_MRST && PCI && PCI_GOANY && X86_32 && X86_EXTENDED_PLATFORM && X86_IO_APIC) selects INTEL_SCU_IPC which has unmet direct dependencies (X86 && X86_PLATFORM_DEVICES && X86_MRST) which is due to the hierarchical menu structure. Select X86_PLATFORM_DEVICES as well. Originally-from: Randy Dunlap Signed-off-by: Thomas Gleixner LKML-Reference: <20101115101406.77e072ef.randy.dunlap@oracle.com> Cc: Alan Cox diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6e877b9..655fcf5a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -400,6 +400,7 @@ config X86_MRST select I2C select SPI select INTEL_SCU_IPC + select X86_PLATFORM_DEVICES ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: -- cgit v0.10.2 From d3e1884bc585a43674d2cb0d3f0aeeb0ae43bc04 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 17 Nov 2010 12:11:24 +0000 Subject: x86, mrst: Add explanation for using 1960 as the year offset for vrtc Explain the reason for the apparently odd choice of year offset so we don't get more questions about it. 
Signed-off-by: Feng Tang Signed-off-by: Alan Cox LKML-Reference: <20101117121050.9998.89348.stgit@localhost.localdomain> Signed-off-by: Thomas Gleixner diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 67b6be2..bcd0cf6 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -66,7 +66,11 @@ static inline int is_intr(u8 rtc_intr) * rtc_time's year contains the increment over 1900, but vRTC's YEAR * register can't be programmed to value larger than 0x64, so vRTC * driver chose to use 1960 (1970 is UNIX time start point) as the base, - * and does the translation at read/write time + * and does the translation at read/write time. + * + * Why not just use 1970 as the offset? it's because using 1960 will + * make it consistent in leap year setting for both vrtc and low-level + * physical rtc devices. */ static int mrst_read_time(struct device *dev, struct rtc_time *time) { -- cgit v0.10.2 From b2c0710c464ede15e1fc52fb1e7ee9ba54cea186 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Sep 2010 13:40:39 -0700 Subject: rcu: move TINY_RCU from softirq to kthread If RCU priority boosting is to be meaningful, callback invocation must be boosted in addition to preempted RCU readers. Otherwise, in presence of CPU real-time threads, the grace period ends, but the callbacks don't get invoked. If the callbacks don't get invoked, the associated memory doesn't get freed, so the system is still subject to OOM. But it is not reasonable to priority-boost RCU_SOFTIRQ, so this commit moves the callback invocations to a kthread, which can be boosted easily. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 03cda7b..7142ee3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -118,7 +118,6 @@ static inline int rcu_preempt_depth(void) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ -extern void rcu_init(void); extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 13877cb..ea025a6 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,7 +27,9 @@ #include -#define rcu_init_sched() do { } while (0) +static inline void rcu_init(void) +{ +} #ifdef CONFIG_TINY_RCU @@ -125,16 +127,12 @@ static inline void rcu_cpu_stall_reset(void) } #ifdef CONFIG_DEBUG_LOCK_ALLOC - extern int rcu_scheduler_active __read_mostly; extern void rcu_scheduler_starting(void); - #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - static inline void rcu_scheduler_starting(void) { } - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 95518e6..c0e9683 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,6 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H +extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern void rcu_cpu_stall_reset(void); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index d806735..86eef29 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -59,8 +59,15 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Controls for rcu_cbs() kthread, replacing RCU_SOFTIRQ used previously. 
*/ +static struct task_struct *rcu_cbs_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_cbs_wq); +static unsigned long have_rcu_cbs; +static void invoke_rcu_cbs(void); + /* Forward declarations for rcutiny_plugin.h. */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static int rcu_cbs(void *arg); static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); @@ -123,7 +130,7 @@ void rcu_sched_qs(int cpu) { if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cbs(); } /* @@ -132,7 +139,7 @@ void rcu_sched_qs(int cpu) void rcu_bh_qs(int cpu) { if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cbs(); } /* @@ -152,10 +159,10 @@ void rcu_check_callbacks(int cpu, int user) } /* - * Helper function for rcu_process_callbacks() that operates on the - * specified rcu_ctrlkblk structure. + * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure + * whose grace period has elapsed. */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) { struct rcu_head *next, *list; unsigned long flags; @@ -180,19 +187,52 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); + local_bh_disable(); list->func(list); + local_bh_enable(); list = next; } } /* - * Invoke any callbacks whose grace period has completed. + * This kthread invokes RCU callbacks whose grace periods have + * elapsed. It is awakened as needed, and takes the place of the + * RCU_SOFTIRQ that was used previously for this purpose. + * This is a kthread, but it is never stopped, at least not until + * the system goes down. + */ +static int rcu_cbs(void *arg) +{ + unsigned long work; + unsigned long flags; + + for (;;) { + wait_event(rcu_cbs_wq, have_rcu_cbs != 0); + local_irq_save(flags); + work = have_rcu_cbs; + have_rcu_cbs = 0; + local_irq_restore(flags); + if (work) { + rcu_process_callbacks(&rcu_sched_ctrlblk); + rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); + } + } + + return 0; /* Not reached, but needed to shut gcc up. */ +} + +/* + * Wake up rcu_cbs() to process callbacks now eligible for invocation. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static void invoke_rcu_cbs(void) { - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); + unsigned long flags; + + local_irq_save(flags); + have_rcu_cbs = 1; + wake_up(&rcu_cbs_wq); + local_irq_restore(flags); } /* @@ -282,7 +322,12 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -void __init rcu_init(void) +/* + * Spawn the kthread that invokes RCU callbacks. + */ +static int __init rcu_spawn_kthreads(void) { - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + rcu_cbs_task = kthread_run(rcu_cbs, NULL, "rcu_cbs"); + return 0; } +early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 6ceca4f..95f9239 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -22,6 +22,8 @@ * Author: Paul E. 
McKenney */ +#include + #ifdef CONFIG_TINY_PREEMPT_RCU #include @@ -164,9 +166,9 @@ static void rcu_preempt_cpu_qs(void) if (!rcu_preempt_blocked_readers_any()) rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + /* If there are done callbacks, cause them to be invoked. */ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cbs(); } /* @@ -374,7 +376,7 @@ static void rcu_preempt_check_callbacks(void) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cbs(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && rcu_preempt_running_reader()) @@ -383,7 +385,7 @@ /* * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from __rcu_process_callbacks() to + * update, so this is invoked from rcu_process_callbacks() to * handle that case. Of course, it is invoked for all flavors of * RCU, but RCU callbacks can appear only on one of the lists, and * neither ->nexttail nor ->donetail can possibly be NULL, so there @@ -400,7 +402,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_preempt_process_callbacks(void) { - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); + rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); } /* @@ -599,14 +601,13 @@ static void rcu_preempt_process_callbacks(void) #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC - #include /* * During boot, we forgive RCU lockdep issues. After this function is * invoked, we start taking RCU lockdep issues seriously. */ -void rcu_scheduler_starting(void) +void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); rcu_scheduler_active = 1; -- cgit v0.10.2 From 79250af2d5953b69380a6319b493862bf4ece972 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 16 Nov 2010 10:10:04 -0800 Subject: x86: Fix included-by file reference comments Adjust the paths for files that are including verify_cpu.S. Reported-by: Yinghai Lu Signed-off-by: Kees Cook Acked-by: Pekka Enberg Cc: Alan Cox LKML-Reference: <1289931004-16066-1-git-send-email-kees.cook@canonical.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 5644b4b..0edefc1 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -17,9 +17,9 @@ * file is included at various places and compiled in that context. * This file is expected to run in 32bit code. Currently: * * - * arch/x86_64/boot/compressed/head_64.S: Boot cpu verification - * arch/x86_64/kernel/trampoline_64.S: secondary processor verfication - * arch/x86_64/kernel/head_32.S: processor startup + * arch/x86/boot/compressed/head_64.S: Boot cpu verification + * arch/x86/kernel/trampoline_64.S: secondary processor verification + * arch/x86/kernel/head_32.S: processor startup * * verify_cpu, returns the status of longmode and SSE in register %eax. * 0: Success 1: Failure -- cgit v0.10.2 From 82148d1d0b2f369851f2dff5088f7840f9f16abf Mon Sep 17 00:00:00 2001 From: Shérab Date: Sat, 25 Sep 2010 06:06:57 +0200 Subject: x86/platform: Add Eurobraille/Iris power off support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Iris machines from Eurobraille do not have APM or ACPI support to shut themselves down properly.
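Before the patch itself, a stripped-down sketch of the pattern such a module follows (illustrative only; the my_* names are invented here, and the real module adds Iris-specific probing and I/O): save the global pm_power_off hook, install your own handler, and restore the old one on unload.

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/pm.h>

    static void (*old_pm_power_off)(void);

    static void my_power_off(void)
    {
            /* platform-specific I/O sequence would go here */
    }

    static int __init my_poweroff_init(void)
    {
            old_pm_power_off = pm_power_off;        /* remember the previous handler */
            pm_power_off = my_power_off;            /* take over the final power-off step */
            return 0;
    }

    static void __exit my_poweroff_exit(void)
    {
            pm_power_off = old_pm_power_off;        /* restore on unload */
    }

    module_init(my_poweroff_init);
    module_exit(my_poweroff_exit);
    MODULE_LICENSE("GPL");

The commit below instantiates this pattern with the Iris-specific I/O sequence.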
A special I/O sequence is needed to shut them down. This module runs this I/O sequence at kernel shutdown when its force parameter is set to 1. Signed-off-by: Shérab Acked-by: "H. Peter Anvin" [ did minor coding style edits ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 655fcf5a..6ab6310 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -482,6 +482,19 @@ config X86_ES7000 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is supposed to run on an IA32-based Unisys ES7000 system. +config X86_32_IRIS + tristate "Eurobraille/Iris poweroff module" + depends on X86_32 + ---help--- + The Iris machines from EuroBraille do not have APM or ACPI support + to shut themselves down properly. A special I/O sequence is + needed to do so, which is what this module does at + kernel shutdown. + + This is only for Iris machines from EuroBraille. + + If unused, say N. + config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9e13763..beac17a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -109,6 +109,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_X86_32_IRIS) += iris.o ### # 64 bit specific files diff --git a/arch/x86/kernel/iris.c b/arch/x86/kernel/iris.c new file mode 100644 index 0000000..1ba7f5e --- /dev/null +++ b/arch/x86/kernel/iris.c @@ -0,0 +1,91 @@ +/* + * Eurobraille/Iris power off support. + * + * Eurobraille's Iris machine is a PC with no APM or ACPI support. + * It is shut down by a special I/O sequence which this module provides. + * + * Copyright (C) Shérab + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRIS_GIO_BASE 0x340 +#define IRIS_GIO_INPUT IRIS_GIO_BASE +#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1) +#define IRIS_GIO_PULSE 0x80 /* First byte to send */ +#define IRIS_GIO_REST 0x00 /* Second byte to send */ +#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sébastien Hinderer "); +MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); +MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); + +static int force; + +module_param(force, bool, 0); +MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); + +static void (*old_pm_power_off)(void); + +static void iris_power_off(void) +{ + outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT); + msleep(850); + outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT); +} + +/* + * Before installing the power_off handler, try to make sure the OS is + * running on an Iris. Since Iris does not support DMI, this is done + * by reading its input port and seeing whether the read value is + * meaningful.
+ */ +static int iris_init(void) +{ + unsigned char status; + if (force != 1) { + printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n"); + return -ENODEV; + } + status = inb(IRIS_GIO_INPUT); + if (status == IRIS_GIO_NODEV) { + printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); + return -ENODEV; + } + old_pm_power_off = pm_power_off; + pm_power_off = &iris_power_off; + printk(KERN_INFO "Iris power_off handler installed.\n"); + + return 0; +} + +static void iris_exit(void) +{ + pm_power_off = old_pm_power_off; + printk(KERN_INFO "Iris power_off handler uninstalled.\n"); +} + +module_init(iris_init); +module_exit(iris_exit); -- cgit v0.10.2 From 64edc8ed5ffae999d8d413ba006850e9e34166cb Mon Sep 17 00:00:00 2001 From: matthieu castet Date: Tue, 16 Nov 2010 22:30:27 +0100 Subject: x86: Fix improper large page preservation This patch fixes a bug in try_preserve_large_page() which may result in improper large page preservation and improper application of page attributes to the memory area outside of the original change request. More specifically, the problem manifests itself when set_memory_*() is called for several pages at the beginning of the large page and try_preserve_large_page() erroneously concludes that the change can be applied to whole large page. The fix consists of 3 parts: 1. Addition of "required" protection attributes in static_protections(), so .data and .bss can be guaranteed to stay "RW" 2. static_protections() is now called for every small page within large page to determine compatibility of new protection attributes (instead of just small pages within the requested range). 3. Large page can be preserved only if attribute change is large-page-aligned and covers whole large page. -v1: Try_preserve_large_page() patch for Linux 2.6.34-rc2 -v2: Replaced pfn check with address check for kernel rw-data Signed-off-by: Siarhei Liakh Signed-off-by: Xuxian Jiang Reviewed-by: Suresh Siddha Cc: Arjan van de Ven Cc: James Morris Cc: Andi Kleen Cc: Rusty Russell Cc: Stephen Rothwell Cc: Dave Jones Cc: Kees Cook Cc: Linus Torvalds LKML-Reference: <4CE2F7F3.8030809@free.fr> Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 532e793..6f2a6b6 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -255,6 +255,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, unsigned long pfn) { pgprot_t forbidden = __pgprot(0); + pgprot_t required = __pgprot(0); /* * The BIOS area between 640k and 1Mb needs to be executable for @@ -278,6 +279,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; + /* + * .data and .bss should always be writable. 
+ */ + if (within(address, (unsigned long)_sdata, (unsigned long)_edata) || + within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) + pgprot_val(required) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* @@ -317,6 +324,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + prot = __pgprot(pgprot_val(prot) | pgprot_val(required)); return prot; } @@ -393,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, { unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; pte_t new_pte, old_pte, *tmp; - pgprot_t old_prot, new_prot; + pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; unsigned int level; @@ -438,10 +446,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * We are safe now. Check whether the new pgprot is the same: */ old_pte = *kpte; - old_prot = new_prot = pte_pgprot(old_pte); + old_prot = new_prot = req_prot = pte_pgprot(old_pte); - pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); - pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * old_pte points to the large page base address. So we need @@ -450,17 +458,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; - new_prot = static_protections(new_prot, address, pfn); + new_prot = static_protections(req_prot, address, pfn); /* * We need to check the full range, whether * static_protection() requires a different pgprot for one of * the pages in the range we try to preserve: */ - addr = address + PAGE_SIZE; - pfn++; - for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(new_prot, addr, pfn); + addr = address & pmask; + pfn = pte_pfn(old_pte); + for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { + pgprot_t chk_prot = static_protections(req_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) goto out_unlock; @@ -483,7 +491,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * that we limited the number of possible pages already to * the number of pages in the large page. */ - if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { /* * The address is aligned and the number of pages * covers the full page. -- cgit v0.10.2 From 5bd5a452662bc37c54fb6828db1a3faf87e6511c Mon Sep 17 00:00:00 2001 From: Matthieu Castet Date: Tue, 16 Nov 2010 22:31:26 +0100 Subject: x86: Add NX protection for kernel data This patch expands functionality of CONFIG_DEBUG_RODATA to set main (static) kernel data area as NX. The following steps are taken to achieve this: 1. Linker script is adjusted so .text always starts and ends on a page boundary 2. Linker script is adjusted so .rodata always starts and ends on a page boundary 3. NX is set for all pages from _etext through _end in mark_rodata_ro. 4. free_init_pages() sets released memory NX in arch/x86/mm/init.c 5. BIOS ROM is set to x when pcibios is used. Steps 3 and 4 are sketched below.
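A rough sketch of steps 3 and 4 (illustrative only; the nx_protect_kernel_data() helper is invented here, and the real code in the hunks below is more careful about HPAGE rounding and the 32/64-bit differences):

    /* Step 3: strip X from everything between _etext and _end, assuming
     * the CPU and kernel support NX at all. */
    static void nx_protect_kernel_data(void)
    {
            unsigned long start = PFN_ALIGN((unsigned long)_etext);
            unsigned long end = PFN_ALIGN((unsigned long)_end);

            if (__supported_pte_mask & _PAGE_NX)
                    set_memory_nx(start, (end - start) >> PAGE_SHIFT);
    }

Step 4 is visible directly in the free_init_pages() hunk further down: pages being handed back to the allocator are made non-executable before they are made writable and freed.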
The results of patch application may be observed in the diff of kernel page table dumps: pcibios: -- data_nx_pt_before.txt 2009-10-13 07:48:59.000000000 -0400 ++ data_nx_pt_after.txt 2009-10-13 07:26:46.000000000 -0400 0x00000000-0xc0000000 3G pmd ---[ Kernel Mapping ]--- -0xc0000000-0xc0100000 1M RW GLB x pte +0xc0000000-0xc00a0000 640K RW GLB NX pte +0xc00a0000-0xc0100000 384K RW GLB x pte -0xc0100000-0xc03d7000 2908K ro GLB x pte +0xc0100000-0xc0318000 2144K ro GLB x pte +0xc0318000-0xc03d7000 764K ro GLB NX pte -0xc03d7000-0xc0600000 2212K RW GLB x pte +0xc03d7000-0xc0600000 2212K RW GLB NX pte 0xc0600000-0xf7a00000 884M RW PSE GLB NX pmd 0xf7a00000-0xf7bfe000 2040K RW GLB NX pte 0xf7bfe000-0xf7c00000 8K pte No pcibios: -- data_nx_pt_before.txt 2009-10-13 07:48:59.000000000 -0400 ++ data_nx_pt_after.txt 2009-10-13 07:26:46.000000000 -0400 0x00000000-0xc0000000 3G pmd ---[ Kernel Mapping ]--- -0xc0000000-0xc0100000 1M RW GLB x pte +0xc0000000-0xc0100000 1M RW GLB NX pte -0xc0100000-0xc03d7000 2908K ro GLB x pte +0xc0100000-0xc0318000 2144K ro GLB x pte +0xc0318000-0xc03d7000 764K ro GLB NX pte -0xc03d7000-0xc0600000 2212K RW GLB x pte +0xc03d7000-0xc0600000 2212K RW GLB NX pte 0xc0600000-0xf7a00000 884M RW PSE GLB NX pmd 0xf7a00000-0xf7bfe000 2040K RW GLB NX pte 0xf7bfe000-0xf7c00000 8K pte The patch has been originally developed for Linux 2.6.34-rc2 x86 by Siarhei Liakh and Xuxian Jiang . -v1: initial patch for 2.6.30 -v2: patch for 2.6.31-rc7 -v3: moved all code into arch/x86, adjusted credits -v4: fixed ifdef, removed credits from CREDITS -v5: fixed an address calculation bug in mark_nxdata_nx() -v6: added acked-by and PT dump diff to commit log -v7: minor adjustments for -tip -v8: rework with the merge of "Set first MB as RW+NX" Signed-off-by: Siarhei Liakh Signed-off-by: Xuxian Jiang Signed-off-by: Matthieu CASTET Cc: Arjan van de Ven Cc: James Morris Cc: Andi Kleen Cc: Rusty Russell Cc: Stephen Rothwell Cc: Dave Jones Cc: Kees Cook Cc: Linus Torvalds LKML-Reference: <4CE2F82E.60601@free.fr> [ minor cleanliness edits ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ca0437c..6761292 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -65,6 +65,7 @@ extern unsigned long pci_mem_start; #define PCIBIOS_MIN_CARDBUS_IO 0x4000 +extern int pcibios_enabled; void pcibios_config_init(void); struct pci_bus *pcibios_scan_root(int bus); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e03530a..bf47007 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -69,7 +69,7 @@ jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ + data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP @@ -116,6 +116,10 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 +#if defined(CONFIG_DEBUG_RODATA) + /* .text should occupy whole number of pages */ + . = ALIGN(PAGE_SIZE); +#endif X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) X64_ALIGN_DEBUG_RODATA_END @@ -335,7 +339,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) - . = ALIGN(4); + . 
= ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c0e28a1..947f42ab 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that - * writeable first. + * writeable and non-executable first. */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e969f9..f89b5bb 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) static inline int is_kernel_text(unsigned long addr) { - if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) return 1; return 0; } @@ -912,6 +912,23 @@ void set_kernel_text_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); } +static void mark_nxdata_nx(void) +{ + /* + * When this is called, init has already been executed and released, + * so everything past _etext should be NX. + */ + unsigned long start = PFN_ALIGN(_etext); + /* + * This comes from the is_kernel_text() upper limit, rounded up to a + * full HPAGE (large pages are used where possible): + */ + unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); + set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); @@ -946,6 +963,7 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif + mark_nxdata_nx(); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 71a5929..ce59c05 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -788,6 +788,7 @@ void mark_rodata_ro(void) unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK); unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); unsigned long data_start = (unsigned long) &_sdata; @@ -802,7 +803,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT); rodata_test(); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6f2a6b6..8b830ca 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -261,8 +262,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, * The BIOS area between 640k and 1Mb needs to be executable for * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/ - if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) +#ifdef CONFIG_PCI_BIOS + if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_NX; +#endif /* * The kernel text needs to be executable for obvious reasons diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 2492d16..a5f7d0d 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -9,6 +9,7 @@ #include #include #include +#include /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) @@ -25,6 +26,27 @@ #define PCIBIOS_HW_TYPE1_SPEC 0x10 #define PCIBIOS_HW_TYPE2_SPEC 0x20 +int pcibios_enabled; + +/* According to the BIOS specification at: + * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could + * restrict the x zone to some pages and make it ro. But this may be + * broken on some BIOSes and is complex to handle with static_protections. + * We could make the 0xe0000-0x100000 range rox, but this can break + * some ISA mapping. + * + * So we leave an rw and x hole when pcibios is used. This shouldn't + * happen on modern systems with mmconfig, and if you don't want it + * you can disable pcibios... + */ +static inline void set_bios_x(void) +{ + pcibios_enabled = 1; + set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n"); +} + /* * This is the standard structure used to identify the entry point * to the BIOS32 Service Directory, as documented in @@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); bios32_indirect.address = bios32_entry + PAGE_OFFSET; + set_bios_x(); if (check_pcibios()) return &pci_bios_access; } -- cgit v0.10.2 From 48c5ccae88dcd989d9de507e8510313c6cbd352b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Nov 2010 19:32:29 +0100 Subject: sched: Simplify cpu-hot-unplug task migration While discussing the need for sched_idle_next(), Oleg remarked that since try_to_wake_up() ensures sleeping tasks will end up running on a sane cpu, we can do away with migrate_live_tasks(). If we then extend the existing hack of migrating current from CPU_DYING to migrating the full rq's worth of tasks from CPU_DYING, the need for the sched_idle_next() abomination disappears as well, since idle will be the only possible thread left after the migration thread stops. This greatly simplifies the hot-unplug task migration path, as can be seen from the resulting code reduction (and about half the new lines are comments).
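Condensed, the new hot-unplug flow is (a paraphrase of the CPU_DYING notifier changes in the diff below, not extra code):

    case CPU_DYING:
            raw_spin_lock_irqsave(&rq->lock, flags);
            if (rq->rd)
                    set_rq_offline(rq);             /* detach from the root domain */
            migrate_tasks(cpu);                     /* push every runnable task away */
            BUG_ON(rq->nr_running != 1);            /* only the migration thread is left */
            raw_spin_unlock_irqrestore(&rq->lock, flags);
            break;

migrate_tasks() repeatedly calls pick_next_task() and pushes each task to a runqueue chosen by select_fallback_rq(), temporarily clearing rq->stop so the loop can never select the stop task itself; sleeping tasks need no migration because try_to_wake_up() will place them on a sane cpu when they wake.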
Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <1289851597.2109.547.camel@laptop> Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 3cd70cf..29d953a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,14 +1871,11 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU -extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} #endif -extern void sched_idle_next(void); - #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) extern void wake_up_idle_cpu(int cpu); #else diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f..8615aa6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { - struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -208,11 +207,6 @@ static int __ref take_cpu_down(void *_param) cpu_notify(CPU_DYING | param->mod, param->hcpu); - if (task_cpu(param->caller) == cpu) - move_task_off_dead_cpu(cpu, param->caller); - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. */ - sched_idle_next(); return 0; } @@ -223,7 +217,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { - .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -253,9 +246,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } BUG_ON(cpu_online(cpu)); - /* Wait for it to sleep (leaving idle task). */ - while (!idle_cpu(cpu)) - yield(); + /* + * The migration_call() CPU_DYING callback will have removed all + * runnable tasks from the cpu, there's only the idle task left now + * that the migration thread is done doing the stop_machine thing. + */ + BUG_ON(!idle_cpu(cpu)); /* This actually kills the CPU. */ __cpu_die(cpu); diff --git a/kernel/sched.c b/kernel/sched.c index 41f1869..b0d5f1b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2366,18 +2366,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (unlikely(dest_cpu >= nr_cpu_ids)) { - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -5712,29 +5709,20 @@ static int migration_cpu_stop(void *data) } #ifdef CONFIG_HOTPLUG_CPU + /* - * Figure out where task on dead CPU should go, use force if necessary. + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. 
*/ -void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void idle_task_exit(void) { - struct rq *rq = cpu_rq(dead_cpu); - int needs_cpu, uninitialized_var(dest_cpu); - unsigned long flags; + struct mm_struct *mm = current->active_mm; - local_irq_save(flags); + BUG_ON(cpu_online(smp_processor_id())); - raw_spin_lock(&rq->lock); - needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); - if (needs_cpu) - dest_cpu = select_fallback_rq(dead_cpu, p); - raw_spin_unlock(&rq->lock); - /* - * It can only fail if we race with set_cpus_allowed(), - * in the racer should migrate the task anyway. - */ - if (needs_cpu) - __migrate_task(p, dead_cpu, dest_cpu); - local_irq_restore(flags); + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); } /* @@ -5747,128 +5735,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) static void migrate_nr_uninterruptible(struct rq *rq_src) { struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - unsigned long flags; - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); } /* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. + * remove the tasks which were accounted by rq from calc_load_tasks. */ -void sched_idle_next(void) +static void calc_global_load_remove(struct rq *rq) { - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - raw_spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - activate_task(rq, p, 0); - - raw_spin_unlock_irqrestore(&rq->lock, flags); + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } /* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +static void migrate_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. 
*/ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. */ - raw_spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - raw_spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + rq->stop = NULL; for ( ; ; ) { - if (!rq->nr_running) + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) break; + next = pick_next_task(rq); - if (!next) - break; + BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); } -} -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) -{ - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + rq->stop = stop; } + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -6078,15 +6007,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: - case CPU_ONLINE_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -6098,30 +6025,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_live_tasks(cpu); - /* Idle task back to normal (off runqueue, low prio) */ - raw_spin_lock_irq(&rq->lock); - deactivate_task(rq, rq->idle, 0); - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - raw_spin_unlock_irq(&rq->lock); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - calc_global_load_remove(rq); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + + migrate_nr_uninterruptible(rq); + calc_global_load_remove(rq); break; #endif } -- cgit v0.10.2 From 2069dd75c7d0f49355939e5586daf5a9ab216db7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:00 -0800 Subject: sched: Rewrite tg_shares_up() By tracking a per-cpu load-avg for each cfs_rq and folding it into a global task_group load on each tick
we can rework tg_shares_up to be strictly per-cpu. This should improve cpu-cgroup performance for smp systems significantly. [ Paul: changed to use queueing cfs_rq + bug fixes ] Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.580480400@google.com> Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 29d953a..8abb8aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1885,8 +1885,6 @@ static inline void wake_up_idle_cpu(int cpu) { } extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_shares_ratelimit; -extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_child_runs_first; enum sched_tunable_scaling { diff --git a/kernel/sched.c b/kernel/sched.c index b0d5f1b..e2f1a30 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -253,6 +253,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + + atomic_t load_weight; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -359,15 +361,11 @@ struct cfs_rq { */ unsigned long h_load; - /* - * this cpu's part of tg->shares - */ - unsigned long shares; + u64 load_avg; + u64 load_period; + u64 load_stamp; - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; + unsigned long load_contribution; #endif #endif }; @@ -807,20 +805,6 @@ late_initcall(sched_init_debug); const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - -/* * period over which we average the RT time consumption, measured * in ms. * @@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long __percpu *update_shares_data; - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, - unsigned long sd_rq_weight, - unsigned long *usd_rq_weight) -{ - unsigned long shares, rq_weight; - int boost = 0; - - rq_weight = usd_rq_weight[cpu]; - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum_j shares_j * rq_weight_i - * shares_i = ----------------------------- - * \Sum_j rq_weight_j - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->rq_weight = boost ? 
0 : rq_weight; - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - __set_se_shares(tg->se[cpu], shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} +static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. + * update tg->load_weight by folding this cpu's load_avg */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; - unsigned long *usd_rq_weight; - struct sched_domain *sd = data; + long load_avg; + struct cfs_rq *cfs_rq; unsigned long flags; - int i; + int cpu = (long)data; + struct rq *rq; - if (!tg->se[0]) + if (!tg->se[cpu]) return 0; - local_irq_save(flags); - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); - - for_each_cpu(i, sched_domain_span(sd)) { - weight = tg->cfs_rq[i]->load.weight; - usd_rq_weight[i] = weight; - - rq_weight += weight; - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - if (!weight) - weight = NICE_0_LOAD; + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; - sum_weight += weight; - shares += tg->cfs_rq[i]->shares; - } + raw_spin_lock_irqsave(&rq->lock, flags); - if (!rq_weight) - rq_weight = sum_weight; + update_rq_clock(rq); + update_cfs_load(cfs_rq); - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. 
+ */ + update_cfs_shares(cfs_rq); - local_irq_restore(flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); return 0; } @@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data) load = cpu_rq(cpu)->load.weight; } else { load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; + load *= tg->se[cpu]->load.weight; load /= tg->parent->cfs_rq[cpu]->load.weight + 1; } @@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(struct sched_domain *sd) +static void update_shares(long cpu) { - s64 elapsed; - u64 now; - if (root_task_group_empty()) return; - now = local_clock(); - elapsed = now - sd->last_update; + /* + * XXX: replace with an on-demand list + */ - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } + walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu); } static void update_h_load(long cpu) @@ -1699,7 +1631,7 @@ static void update_h_load(long cpu) #else -static inline void update_shares(struct sched_domain *sd) +static inline void update_shares(int cpu) { } @@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -5551,7 +5474,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_min_granularity); SET_SYSCTL(sched_latency); SET_SYSCTL(sched_wakeup_granularity); - SET_SYSCTL(sched_shares_ratelimit); #undef SET_SYSCTL } @@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; + update_load_set(&se->load, tg->shares); se->parent = parent; } #endif @@ -7881,10 +7802,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), - __alignof__(unsigned long)); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) dequeue_entity(cfs_rq, se, 0); - se->load.weight = shares; - se->load.inv_weight = 0; + update_load_set(&se->load, shares); if (on_rq) enqueue_entity(cfs_rq, se, 0); @@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* * force a rebalance */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d1..e6590e7 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -202,15 +202,22 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - 
SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", + SPLIT_NS(cfs_rq->load_avg)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", + SPLIT_NS(cfs_rq->load_period)); + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", + cfs_rq->load_contribution); + SEQ_printf(m, " .%-30s: %d\n", "load_tg", + atomic_read(&tg->load_weight)); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a83..d86544b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void update_cfs_load(struct cfs_rq *cfs_rq) +{ + u64 period = sched_avg_period(); + u64 now, delta; + + if (!cfs_rq) + return; + + now = rq_of(cfs_rq)->clock; + delta = now - cfs_rq->load_stamp; + + cfs_rq->load_stamp = now; + cfs_rq->load_period += delta; + cfs_rq->load_avg += delta * cfs_rq->load.weight; + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) + account_entity_dequeue(cfs_rq, se); + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ + struct task_group *tg; + struct sched_entity *se; + long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline void update_cfs_load(struct cfs_rq *cfs_rq) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); + update_cfs_load(cfs_rq); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. 
- */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3156,8 +3206,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920..68e69ac 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_SHARES_UPDATE, 1) -SCHED_FEAT(ASYM_EFF_LOAD, 1) /* * Spin-wait on mutex acquisition when the mutex owner is running on diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b65bf63..3132b25 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -static int min_sched_shares_ratelimit = 100000; /* 100 usec */ -static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif #ifdef CONFIG_COMPACTION @@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_wakeup_granularity_ns, }, { - .procname = "sched_shares_ratelimit", - .data = &sysctl_sched_shares_ratelimit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_shares_ratelimit, - .extra2 = &max_sched_shares_ratelimit, - }, - { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), @@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), -- cgit v0.10.2 From 3d4b47b4b040c9d77dd68104cfc1055d89a55afd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:01 -0800 Subject: sched: Implement on-demand (active) cfs_rq list Make certain load-balance actions scale per number of active cgroups instead of the 
number of existing cgroups. This makes wakeup/sleep paths more expensive, but is a win for systems where the vast majority of existing cgroups are idle. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.666535048@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index e2f1a30..22436dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -274,9 +274,7 @@ struct task_group { #define root_task_group init_task_group -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. - */ +/* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -344,6 +342,7 @@ struct cfs_rq { * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This * list is used during load balance. */ + int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ @@ -1547,7 +1546,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_load(struct cfs_rq *cfs_rq, int lb); static void update_cfs_shares(struct cfs_rq *cfs_rq); /* @@ -1570,7 +1569,7 @@ static int tg_shares_up(struct task_group *tg, void *data) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 1); load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); load_avg -= cfs_rq->load_contribution; @@ -7688,15 +7687,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, + struct sched_entity *se, int cpu, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - if (add) - list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; /* se could be NULL for init_task_group */ @@ -7716,7 +7713,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, #ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent) { struct rq *rq = cpu_rq(cpu); @@ -7725,8 +7722,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; if (!rt_se) @@ -7835,7 +7830,7 @@ void __init sched_init(void) * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). 
*/ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7843,7 +7838,7 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); #ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); #endif #endif @@ -8119,7 +8114,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } return 1; @@ -8130,15 +8125,21 @@ err: return 0; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) @@ -8151,10 +8153,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } @@ -8209,7 +8207,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } return 1; @@ -8219,17 +8217,6 @@ err_free_rq: err: return 0; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} #else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { @@ -8240,14 +8227,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED @@ -8263,7 +8242,6 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; - int i; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -8276,10 +8254,6 @@ struct task_group *sched_create_group(struct task_group *parent) goto err; spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } list_add_rcu(&tg->list, &task_groups); WARN_ON(!parent); /* root should already exist */ @@ -8309,11 +8283,11 @@ void sched_destroy_group(struct task_group *tg) unsigned long flags; int i; - spin_lock_irqsave(&task_group_lock, flags); -
for_each_possible_cpu(i) { + /* end participation in shares distribution */ + for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } + + spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); @@ -8391,7 +8365,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -8408,19 +8381,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ tg->shares = shares; for_each_possible_cpu(i) { /* @@ -8429,15 +8389,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) set_se_shares(tg->se[i], shares); } - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); return 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d86544b..0560e72 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -143,6 +143,24 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return cfs_rq->tg->cfs_rq[this_cpu]; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + + cfs_rq->on_list = 1; + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) @@ -246,6 +264,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return &cpu_rq(this_cpu)->cfs; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) @@ -648,7 +674,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq) +static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { u64 period = sched_avg_period(); u64 now, delta; @@ -673,6 +699,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq) cfs_rq->load_period /= 2; cfs_rq->load_avg /= 2; } + + if (lb && !cfs_rq->nr_running) { + if (cfs_rq->load_avg < (period / 8)) + list_del_leaf_cfs_rq(cfs_rq); + } } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -719,7 +750,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), 
se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq) +static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { } @@ -849,7 +880,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -863,6 +894,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; + + if (cfs_rq->nr_running == 1) + list_add_leaf_cfs_rq(cfs_rq); } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -907,7 +941,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); @@ -1142,7 +1176,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } @@ -1172,7 +1206,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79..c914ec7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_add_rcu(&rt_rq->leaf_rt_rq_list, + &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) @@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(def_rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) @@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; + if (!rt_rq->rt_nr_running) + list_add_leaf_rt_rq(rt_rq); + if (head) list_add(&rt_se->run_list, queue); else @@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); + if (!rt_rq->rt_nr_running) + list_del_leaf_rt_rq(rt_rq); } /* -- cgit v0.10.2 From 9e3081ca61147b29f52fddb4f7c6b6b82ea5eb7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:02 -0800 Subject: sched: Make tg_shares_up() walk on-demand Make tg_shares_up() use the active cgroup list; this means we cannot do a strict bottom-up walk of the hierarchy, but assuming it's a very wide tree with a small number of active groups it should be a win.
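To make the shape of that walk concrete, here is a rough userspace model of the update_shares() this patch adds (the types and the next_leaf link are simplified stand-ins for the kernel's cfs_rq and leaf_cfs_rq_list; the real walk iterates under rcu_read_lock() and tg_shares_up() takes rq->lock):

#include <stdio.h>

struct task_group {
	const char *name;
	struct task_group *parent;
};

/* Linked on the per-cpu leaf list only while it has runnable entities. */
struct cfs_rq {
	struct task_group *tg;
	struct cfs_rq *next_leaf;	/* stand-in for leaf_cfs_rq_list */
};

static void tg_shares_up(struct task_group *tg, int cpu)
{
	printf("fold cpu%d load of %s into tg->load_weight\n", cpu, tg->name);
}

/* Visit each *active* group and climb to the root, so the cost scales
 * with the number of active cgroups, not all existing ones. */
static void update_shares(struct cfs_rq *leaf_list, int cpu)
{
	struct cfs_rq *cfs_rq;
	struct task_group *tg;

	for (cfs_rq = leaf_list; cfs_rq; cfs_rq = cfs_rq->next_leaf)
		for (tg = cfs_rq->tg; tg; tg = tg->parent)
			tg_shares_up(tg, cpu);
}

int main(void)
{
	struct task_group root = { "root", NULL };
	struct task_group child = { "child", &root };
	struct cfs_rq leaf = { &child, NULL };

	update_shares(&leaf, 0);
	return 0;
}

Note that the parent chain is revisited once per active leaf here; the later patch in this series that orders children before parents on the list removes that duplication.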
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.754159484@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 22436dd..6268d2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -279,13 +279,6 @@ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return list_empty(&root_task_group.children); -} -#endif - # define INIT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -1546,48 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq, int lb); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static int tg_shares_up(struct task_group *tg, void *data) -{ - long load_avg; - struct cfs_rq *cfs_rq; - unsigned long flags; - int cpu = (long)data; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return 0; -} - /* * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child @@ -1611,29 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(long cpu) -{ - if (root_task_group_empty()) - return; - - /* - * XXX: replace with an on-demand list - */ - - walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu); -} - static void update_h_load(long cpu) { walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -#else - -static inline void update_shares(int cpu) -{ -} - #endif #ifdef CONFIG_PREEMPT diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0560e72..46ff658 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2004,6 +2004,60 @@ out: } #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int tg_shares_up(struct task_group *tg, int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned long flags; + struct rq *rq; + long load_avg; + + if (!tg->se[cpu]) + return 0; + + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + update_cfs_load(cfs_rq, 1); + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. 
+ */ + update_cfs_shares(cfs_rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return 0; +} + +static void update_shares(int cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(cpu); + + rcu_read_lock(); + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct task_group *tg = cfs_rq->tg; + + do { + tg_shares_up(tg, cpu); + tg = tg->parent; + } while (tg); + } + rcu_read_unlock(); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2051,6 +2105,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return max_load_move - rem_load_move; } #else +static inline void update_shares(int cpu) +{ +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, -- cgit v0.10.2 From f0d7442a5924a802b66eef79b3708f77297bfb35 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:03 -0800 Subject: sched: Fix load corruption from update_cfs_shares() As part of enqueue_entity both a new entity weight and its contribution to the queuing cfs_rq / rq are updated. Since update_cfs_shares will only update the queueing weights when the entity is on_rq (which in this case it is not yet), there's a dependency loop here:

  update_cfs_shares needs account_entity_enqueue to update cfs_rq->load.weight
  account_entity_enqueue needs the updated weight for the queuing cfs_rq load[*]

Fix this and avoid spurious dequeue/enqueues by issuing update_cfs_shares as if we had accounted the enqueue already. This was also resulting in rq->load corruption previously. [*]: this dependency also exists when using the group cfs_rq w/ update_cfs_shares as the weight of the enqueued entity changes without the load being updated.
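A condensed sketch of the corrected ordering (simplified stand-in types; the actual share recomputation is elided):

#include <stdio.h>

struct sched_entity { unsigned long weight; };	/* stand-in for se->load.weight */
struct cfs_rq { unsigned long load_weight; };	/* stand-in for cfs_rq->load.weight */

/* Base the new group share on the weight the queue will have once the
 * entity is accounted, breaking the dependency loop described above. */
static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
	unsigned long load = cfs_rq->load_weight + weight_delta;

	printf("recompute group weight against load %lu\n", load);
}

static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	cfs_rq->load_weight += se->weight;
}

static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_cfs_shares(cfs_rq, se->weight);	/* as if already accounted */
	account_entity_enqueue(cfs_rq, se);	/* then account for real */
}

int main(void)
{
	struct cfs_rq q = { 1024 };
	struct sched_entity se = { 1024 };

	enqueue_entity(&q, &se);
	return 0;
}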
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.844900206@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 46ff658..d52b97a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -718,7 +718,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; @@ -732,7 +732,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) if (!se) return; - load = cfs_rq->load.weight; + load = cfs_rq->load.weight + weight_delta; load_weight = atomic_read(&tg->load_weight); load_weight -= cfs_rq->load_contribution; @@ -754,7 +754,7 @@ static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -881,8 +881,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -944,7 +944,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -1177,7 +1177,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); } hrtick_update(rq); @@ -1207,7 +1207,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); } hrtick_update(rq); @@ -2034,7 +2034,7 @@ static int tg_shares_up(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. */ - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v0.10.2 From e33078baa4d30ad1d0e46d1f62b9e5a63a3e6ee3 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:04 -0800 Subject: sched: Fix update_cfs_load() synchronization Using cfs_rq->nr_running is not sufficient to synchronize update_cfs_load with the put path since nr_running accounting occurs at deactivation. It's also not safe to make the removal decision based on load_avg as this fails with both high periods and low shares. Resolve this by clipping history after 4 periods without activity. Note: the above will always occur from update_shares() since in the last-task-sleep-case that task will still be cfs_rq->curr when update_cfs_load is called. 
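The clipping rule in isolation (PERIOD stands in for the averaging window; the decay folding of load_period/load_avg is omitted here):

#include <stdio.h>

struct cfs_rq_avg {
	unsigned long long load_avg, load_period;
	unsigned long long load_stamp;	/* last update */
	unsigned long long load_last;	/* last update that saw load */
};

#define PERIOD 10000000ULL	/* illustrative window, in ns */

static void update_cfs_load(struct cfs_rq_avg *q, unsigned long long now,
			    unsigned long weight)
{
	unsigned long long delta = now - q->load_stamp;

	/* truncate load history after 4 periods without any load */
	if (q->load_stamp > q->load_last && now - q->load_last > 4 * PERIOD) {
		q->load_period = 0;
		q->load_avg = 0;
	}

	q->load_stamp = now;
	q->load_period += delta;
	if (weight) {
		q->load_last = now;
		q->load_avg += delta * weight;
	}
}

int main(void)
{
	struct cfs_rq_avg q = { 0, 0, 0, 0 };

	update_cfs_load(&q, 1 * PERIOD, 1024);	/* busy */
	update_cfs_load(&q, 2 * PERIOD, 0);	/* went idle; stamp advances */
	update_cfs_load(&q, 7 * PERIOD, 0);	/* >4 idle periods: clipped */
	printf("load_avg=%llu\n", q.load_avg);	/* prints 0 */
	return 0;
}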
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.933428187@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 6268d2d..dadab4d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -355,7 +355,7 @@ struct cfs_rq { u64 load_avg; u64 load_period; - u64 load_stamp; + u64 load_stamp, load_last; unsigned long load_contribution; #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d52b97a..a543a5b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -674,10 +674,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) +static void update_cfs_load(struct cfs_rq *cfs_rq) { u64 period = sched_avg_period(); u64 now, delta; + unsigned long load = cfs_rq->load.weight; if (!cfs_rq) return; @@ -685,9 +686,19 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) now = rq_of(cfs_rq)->clock; delta = now - cfs_rq->load_stamp; + /* truncate load history at 4 idle periods */ + if (cfs_rq->load_stamp > cfs_rq->load_last && + now - cfs_rq->load_last > 4 * period) { + cfs_rq->load_period = 0; + cfs_rq->load_avg = 0; + } + cfs_rq->load_stamp = now; cfs_rq->load_period += delta; - cfs_rq->load_avg += delta * cfs_rq->load.weight; + if (load) { + cfs_rq->load_last = now; + cfs_rq->load_avg += delta * load; + } while (cfs_rq->load_period > period) { /* @@ -700,10 +711,8 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) cfs_rq->load_avg /= 2; } - if (lb && !cfs_rq->nr_running) { - if (cfs_rq->load_avg < (period / 8)) - list_del_leaf_cfs_rq(cfs_rq); - } + if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) + list_del_leaf_cfs_rq(cfs_rq); } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -750,7 +759,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) +static inline void update_cfs_load(struct cfs_rq *cfs_rq) { } @@ -880,7 +889,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); @@ -941,7 +950,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq, 0); @@ -1176,7 +1185,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, 0); } @@ -1206,7 +1215,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, 0); } @@ -2023,7 +2032,7 @@ static int tg_shares_up(struct task_group *tg, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + update_cfs_load(cfs_rq); load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); load_avg -= cfs_rq->load_contribution; -- cgit v0.10.2 From 67e86250f8ea7b8f7da53ac25ea73c6bd71f5cd9 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:05 -0800 Subject: sched: Introduce hierarchical order on shares update list Avoid duplicate shares update calls by ensuring children always appear before parents in rq->leaf_cfs_rq_list. This allows us to do a single in-order traversal for update_shares(). Since we always enqueue in bottom-up order this reduces to 2 cases:

1) Our parent is already in the list, e.g.

      root
       \
        b
       / \
      c   d* (root->b->c already enqueued)

   Since d's parent is enqueued we push it to the head of the list, implicitly ahead of b.

2) Our parent does not appear in the list (or we have no parent)

   In this case we enqueue to the tail of the list, if our parent is subsequently enqueued (bottom-up) it will appear to our right by the same rule.

Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.022488865@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a543a5b..b320753 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -146,8 +146,20 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases.
+ */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } else { + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, &rq_of(cfs_rq)->leaf_cfs_rq_list); + } cfs_rq->on_list = 1; } @@ -2016,7 +2028,7 @@ out: /* * update tg->load_weight by folding this cpu's load_avg */ -static int tg_shares_up(struct task_group *tg, int cpu) +static int update_shares_cpu(struct task_group *tg, int cpu) { struct cfs_rq *cfs_rq; unsigned long flags; @@ -2056,14 +2068,8 @@ static void update_shares(int cpu) struct rq *rq = cpu_rq(cpu); rcu_read_lock(); - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct task_group *tg = cfs_rq->tg; - - do { - tg_shares_up(tg, cpu); - tg = tg->parent; - } while (tg); - } + for_each_leaf_cfs_rq(rq, cfs_rq) + update_shares_cpu(cfs_rq->tg, cpu); rcu_read_unlock(); } -- cgit v0.10.2 From a7a4f8a752ec734b2eab904fc863d5dc873de338 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:06 -0800 Subject: sched: Add sysctl_sched_shares_window Introduce a new sysctl for the shares window and disambiguate it from sched_time_avg. A 10ms window appears to be a good compromise between accuracy and performance. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.112173964@google.com> Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 8abb8aa..840f127 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1900,6 +1900,7 @@ extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; +extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b320753..6c84439 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +/* + * The exponential sliding window over which load is averaged for shares + * distribution. 
+ * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + static const struct sched_class fair_sched_class; /************************************************************** @@ -688,7 +695,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED static void update_cfs_load(struct cfs_rq *cfs_rq) { - u64 period = sched_avg_period(); + u64 period = sysctl_sched_shares_window; u64 now, delta; unsigned long load = cfs_rq->load.weight; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3132b25..9b520d7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -333,6 +333,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "sched_shares_window", + .data = &sysctl_sched_shares_window, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), -- cgit v0.10.2 From c66eaf619c0c7937e9ded160ae83b5a7a6b19b56 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:07 -0800 Subject: sched: Update shares on idle_balance Since shares updates are no longer expensive and effectively local, update them at idle_balance(). This allows us to more quickly redistribute shares to another cpu when our load becomes idle. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.204191702@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c84439..33f941d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3343,6 +3343,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); + update_shares(this_cpu); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; -- cgit v0.10.2 From 3b3d190ec3683d568fd2ebaead5e1ec7f97b6e37 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:08 -0800 Subject: sched: Implement demand based update_cfs_load() When the system is busy, dilation of rq->next_balance makes lb->update_shares() insufficiently frequent for threads which don't sleep (no dequeue/enqueue updates). Adjust for this by making demand based updates based on the accumulation of execution time sufficient to wrap our averaging window. 
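The mechanism in isolation (stub refresh functions; upstream the check lives in __update_curr() and compares against sysctl_sched_shares_window):

#include <stdio.h>

struct cfs_rq_acc { unsigned long long load_unacc_exec_time; };

#define SHARES_WINDOW_NS 10000000ULL	/* default shares window */

static void update_cfs_load(struct cfs_rq_acc *q)
{
	q->load_unacc_exec_time = 0;	/* the real one also folds averages */
	printf("refresh load average\n");
}

static void update_cfs_shares(struct cfs_rq_acc *q)
{
	(void)q;
	printf("refresh group shares\n");
}

/* Called from the execution-accounting path: once a queue accumulates a
 * full window of unaccounted runtime, refresh it even if nothing ever
 * sleeps on it. */
static void account_exec(struct cfs_rq_acc *q, unsigned long long delta_exec)
{
	q->load_unacc_exec_time += delta_exec;
	if (q->load_unacc_exec_time > SHARES_WINDOW_NS) {
		update_cfs_load(q);
		update_cfs_shares(q);
	}
}

int main(void)
{
	struct cfs_rq_acc q = { 0 };
	int tick;

	for (tick = 0; tick < 12; tick++)
		account_exec(&q, 1000000ULL);	/* 1ms of runtime per tick */
	return 0;
}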
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.291159744@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index dadab4d..e914a71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -353,9 +353,16 @@ struct cfs_rq { */ unsigned long h_load; + /* + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time + */ u64 load_avg; u64 load_period; - u64 load_stamp, load_last; + u64 load_stamp, load_last, load_unacc_exec_time; unsigned long load_contribution; #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 33f941d..e7e2f08 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -539,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -558,6 +561,14 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); + +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->load_unacc_exec_time += delta_exec; + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq, 0); + } +#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -713,6 +724,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq) } cfs_rq->load_stamp = now; + cfs_rq->load_unacc_exec_time = 0; cfs_rq->load_period += delta; if (load) { cfs_rq->load_last = now; -- cgit v0.10.2 From d6b5591829bd348a5fbe1c428d28dea00621cdba Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:09 -0800 Subject: sched: Allow update_cfs_load() to update global load Refactor the global load updates from update_shares_cpu() so that update_cfs_load() can update global load when it is more than ~10% out of sync. The new global_load parameter allows us to force an update, regardless of the error factor so that we can synchronize w/ update_shares(). 
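The threshold logic, modeled standalone (a plain long replaces the kernel's atomic_t; the division mirrors the div64_u64() in the patch):

#include <stdio.h>
#include <stdlib.h>

struct task_group { long load_weight; };

struct cfs_rq_contrib {
	struct task_group *tg;
	unsigned long long load_avg, load_period;
	long load_contribution;
};

/* Fold this cpu's average into the group-wide weight only when forced
 * (global_update) or when it has drifted more than ~1/8 from what this
 * queue last contributed. */
static void update_cfs_rq_load_contribution(struct cfs_rq_contrib *q,
					    int global_update)
{
	long load_avg = (long)(q->load_avg / (q->load_period + 1));

	load_avg -= q->load_contribution;
	if (global_update || labs(load_avg) > q->load_contribution / 8) {
		q->tg->load_weight += load_avg;	/* atomic_add() upstream */
		q->load_contribution += load_avg;
	}
}

int main(void)
{
	struct task_group tg = { 0 };
	struct cfs_rq_contrib q = { &tg, 1024ULL * 100, 99, 1000 };

	update_cfs_rq_load_contribution(&q, 0);	/* drift of 24 < 125: skipped */
	update_cfs_rq_load_contribution(&q, 1);	/* forced: folded anyway */
	printf("tg->load_weight=%ld\n", tg.load_weight);
	return 0;
}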
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.377473595@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e7e2f08..390ce30 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -539,7 +539,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); /* @@ -565,7 +565,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->load_unacc_exec_time += delta_exec; if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); } #endif @@ -704,7 +704,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq) +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, + int global_update) +{ + struct task_group *tg = cfs_rq->tg; + long load_avg; + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + + if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + } +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { u64 period = sysctl_sched_shares_window; u64 now, delta; @@ -731,6 +746,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq) cfs_rq->load_avg += delta * load; } + /* consider updating load contribution on each fold or truncate */ + if (global_update || cfs_rq->load_period > period + || !cfs_rq->load_period) + update_cfs_rq_load_contribution(cfs_rq, global_update); + while (cfs_rq->load_period > period) { /* * Inline assembly required to prevent the compiler @@ -790,7 +810,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq) +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } @@ -920,7 +940,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); @@ -981,7 +1001,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq, 0); @@ -1216,7 +1236,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); } @@ -1246,7 +1266,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); } @@ -2052,7 +2072,6 @@ static int update_shares_cpu(struct task_group *tg, int cpu) struct cfs_rq *cfs_rq; unsigned long flags; struct rq *rq; - long load_avg; if (!tg->se[cpu]) return 0; @@ -2063,12 +2082,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq); - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; + update_cfs_load(cfs_rq, 1); /* * We need to update shares after updating tg->load_weight in -- cgit v0.10.2 From 9437178f623a19af5951808d880a8599f66ac150 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:10 -0800 Subject: sched: Update tg->shares after cpu.shares write Formerly sched_group_set_shares would force a rebalance by overflowing domain share sums. Now that per-cpu averages are maintained we can set the true value by issuing an update_cfs_shares() following a tg->shares update. Also initialize tg se->load to 0 for consistency since we'll now set correct weights on enqueue. 
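In sketch form, the new write path walks each cpu's entity hierarchy and lets the regular shares machinery set the true weights (group_cfs_rq(se) is modeled as se->my_q; the per-rq locking is omitted):

#include <stdio.h>

struct cfs_rq { const char *name; };

struct sched_entity {
	struct sched_entity *parent;
	struct cfs_rq *my_q;	/* the group runqueue this se represents */
};

static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
	(void)weight_delta;
	printf("recompute weight for %s from tg->shares\n", cfs_rq->name);
}

/* After tg->shares is written, propagate the change up the hierarchy on
 * each cpu instead of forcing a rebalance. */
static void propagate_shares(struct sched_entity *se)
{
	for (; se; se = se->parent)
		update_cfs_shares(se->my_q, 0);
}

int main(void)
{
	struct cfs_rq parent_q = { "root" }, child_q = { "root/child" };
	struct sched_entity parent_se = { NULL, &parent_q };
	struct sched_entity child_se = { &parent_se, &child_q };

	propagate_shares(&child_se);
	return 0;
}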
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.465521344@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index e914a71..550cf3a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7646,7 +7646,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - update_load_set(&se->load, tg->shares); + update_load_set(&se->load, 0); se->parent = parent; } #endif @@ -8274,37 +8274,12 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - int on_rq; - - on_rq = se->on_rq; - if (on_rq) - dequeue_entity(cfs_rq, se, 0); - - update_load_set(&se->load, shares); - - if (on_rq) - enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -8323,10 +8298,15 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { - /* - * force a rebalance - */ - set_se_shares(tg->se[i], shares); + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } done: -- cgit v0.10.2 From 84e1c6bb38eb318e456558b610396d9f1afaabf0 Mon Sep 17 00:00:00 2001 From: matthieu castet Date: Tue, 16 Nov 2010 22:35:16 +0100 Subject: x86: Add RO/NX protection for loadable kernel modules This patch is a logical extension of the protection provided by CONFIG_DEBUG_RODATA to LKMs. The protection is provided by splitting module_core and module_init into three logical parts each and setting appropriate page access permissions for each individual section:

1. Code: RO+X
2. RO data: RO+NX
3. RW data: RW+NX

In order to achieve proper protection, layout_sections() has been modified to align each of the three parts mentioned above onto a page boundary. Next, the corresponding page access permissions are set right before successful exit from load_module(). Further, free_module() and sys_init_module have been modified to set module_core and module_init as RW+NX right before calling module_free(). By default, the original section layout and access flags are preserved. When compiled with CONFIG_DEBUG_SET_MODULE_RONX=y, the patch will page-align each group of sections to ensure that each page contains only one type of content and will enforce RO/NX for each group of pages. -v1: Initial proof-of-concept patch. -v2: The patch has been re-written to reduce the number of #ifdefs and to make it architecture-agnostic. Code formatting has also been corrected. -v3: Opportunistic RO/NX protection is now unconditional. Section page-alignment is enabled when CONFIG_DEBUG_RODATA=y. -v4: Removed most macros and improved coding style.
-v5: Changed page-alignment and RO/NX section size calculation -v6: Fixed comments. Restricted RO/NX enforcement to x86 only -v7: Introduced CONFIG_DEBUG_SET_MODULE_RONX, added calls to set_all_modules_text_rw() and set_all_modules_text_ro() in ftrace -v8: updated for compatibility with linux 2.6.33-rc5 -v9: coding style fixes -v10: more coding style fixes -v11: minor adjustments for -tip -v12: minor adjustments for v2.6.35-rc2-tip -v13: minor adjustments for v2.6.37-rc1-tip Signed-off-by: Siarhei Liakh Signed-off-by: Xuxian Jiang Acked-by: Arjan van de Ven Reviewed-by: James Morris Signed-off-by: H. Peter Anvin Cc: Andi Kleen Cc: Rusty Russell Cc: Stephen Rothwell Cc: Dave Jones Cc: Kees Cook Cc: Linus Torvalds LKML-Reference: <4CE2F914.9070106@free.fr> [ minor cleanliness edits, -v14: build failure fix ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index b59ee76..45143bb 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_SET_MODULE_RONX + bool "Set loadable kernel module data as NX and text as RO" + depends on MODULES + ---help--- + This option helps catch unintended modifications to loadable + kernel module's text and read-only data. It also prevents execution + of module data. Such protection may interfere with run-time code + patching and dynamic kernel tracing - and they might also protect + against certain classes of kernel exploits. + If in doubt, say "N". + config DEBUG_NX_TEST tristate "Testcase for the NX non-executable stack feature" depends on DEBUG_KERNEL && m diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3afb33f..2984486 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code); int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + set_all_modules_text_rw(); modifying_code = 1; return 0; } @@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void) int ftrace_arch_code_modify_post_process(void) { modifying_code = 0; + set_all_modules_text_ro(); set_kernel_text_ro(); return 0; } diff --git a/include/linux/module.h b/include/linux/module.h index b29e745..ddaa689 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -308,6 +308,9 @@ struct module /* The size of the executable code in each section. 
*/ unsigned int init_text_size, core_text_size; + /* Size of RO sections of the module (text+rodata) */ + unsigned int init_ro_size, core_ro_size; + /* Arch-specific module values */ struct mod_arch_specific arch; @@ -672,7 +675,6 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) { return 0; } - #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS @@ -687,6 +689,13 @@ extern int module_sysfs_initialized; #define __MODULE_STRING(x) __stringify(x) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +extern void set_all_modules_text_rw(void); +extern void set_all_modules_text_ro(void); +#else +static inline void set_all_modules_text_rw(void) { } +static inline void set_all_modules_text_ro(void) { } +#endif #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, diff --git a/kernel/module.c b/kernel/module.c index 437a74a..ba421e6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,6 +56,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -70,6 +71,26 @@ #define ARCH_SHF_SMALL 0 #endif +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_DEBUG_SET_MODULE_RONX=y + */ +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +# define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif + +/* + * Given BASE and SIZE this macro calculates the number of pages the + * memory regions occupies + */ +#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ + (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ + PFN_DOWN((unsigned long)BASE) + 1) \ + : (0UL)) + /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod) return 0; } +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + */ +void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +{ + unsigned long begin_pfn = PFN_DOWN((unsigned long)start); + unsigned long end_pfn = PFN_DOWN((unsigned long)end); + + if (end_pfn > begin_pfn) + set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +} + +static void set_section_ro_nx(void *base, + unsigned long text_size, + unsigned long ro_size, + unsigned long total_size) +{ + /* begin and end PFNs of the current subsection */ + unsigned long begin_pfn; + unsigned long end_pfn; + + /* + * Set RO for module text and RO-data: + * - Always protect first page. + * - Do not protect last partial page. + */ + if (ro_size > 0) + set_page_attributes(base, base + ro_size, set_memory_ro); + + /* + * Set NX permissions for module data: + * - Do not protect first partial page. + * - Always protect last page. 
+ */ + if (total_size > text_size) { + begin_pfn = PFN_UP((unsigned long)base + text_size); + end_pfn = PFN_UP((unsigned long)base + total_size); + if (end_pfn > begin_pfn) + set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + } +} + +/* Setting memory back to RW+NX before releasing it */ +void unset_section_ro_nx(struct module *mod, void *module_region) +{ + unsigned long total_pages; + + if (mod->module_core == module_region) { + /* Set core as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); + set_memory_nx((unsigned long)mod->module_core, total_pages); + set_memory_rw((unsigned long)mod->module_core, total_pages); + + } else if (mod->module_init == module_region) { + /* Set init as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); + set_memory_nx((unsigned long)mod->module_init, total_pages); + set_memory_rw((unsigned long)mod->module_init, total_pages); + } +} + +/* Iterate through all modules and set each module's text as RW */ +void set_all_modules_text_rw() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_rw); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_rw); + } + } + mutex_unlock(&module_mutex); +} + +/* Iterate through all modules and set each module's text as RO */ +void set_all_modules_text_ro() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_ro); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_ro); + } + } + mutex_unlock(&module_mutex); +} +#else +static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } +static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } +#endif + /* Free a module, remove from lists, etc. 
*/ static void free_module(struct module *mod) { @@ -1566,6 +1696,7 @@ static void free_module(struct module *mod) destroy_params(mod->kp, mod->num_kp); /* This may be NULL, but that's OK */ + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1574,6 +1705,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ + unset_section_ro_nx(mod, mod->module_core); module_free(mod, mod->module_core); #ifdef CONFIG_MPU @@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info) s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", name); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->core_size = debug_align(mod->core_size); mod->core_text_size = mod->core_size; + break; + case 1: /* RO: text and ro-data */ + mod->core_size = debug_align(mod->core_size); + mod->core_ro_size = mod->core_size; + break; + case 3: /* whole core */ + mod->core_size = debug_align(mod->core_size); + break; + } } DEBUGP("Init section allocation order:\n"); @@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | INIT_OFFSET_MASK); DEBUGP("\t%s\n", sname); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->init_size = debug_align(mod->init_size); mod->init_text_size = mod->init_size; + break; + case 1: /* RO: text and ro-data */ + mod->init_size = debug_align(mod->init_size); + mod->init_ro_size = mod->init_size; + break; + case 3: /* whole init */ + mod->init_size = debug_align(mod->init_size); + break; + } } } @@ -2650,6 +2804,18 @@ static struct module *load_module(void __user *umod, kfree(info.strmap); free_copy(&info); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + /* Done! */ trace_module_load(mod); return mod; @@ -2753,6 +2919,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->symtab = mod->core_symtab; mod->strtab = mod->core_strtab; #endif + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v0.10.2 From eec1d4fa00c6552ae2fdf71d59f1eded7c88dd89 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Fri, 29 Oct 2010 17:14:30 +0200 Subject: x86, amd-nb: Complete the rename of AMD NB and related code Not only the naming of the files was confusing, it was even more so for the function and variable names. Renamed the K8 NB and NUMA stuff that is also used on other AMD platforms. This also renames the CONFIG_K8_NUMA option to CONFIG_AMD_NUMA and the related file k8topology_64.c to amdtopology_64.c. No functional changes intended. Signed-off-by: Hans Rosenfeld Signed-off-by: Borislav Petkov diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e832768..08993a3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1141,16 +1141,16 @@ config NUMA comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -config K8_NUMA +config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" depends on X86_64 && NUMA && PCI ---help--- - Enable K8 NUMA node topology detection. 
You should say Y here if - you have a multi processor AMD K8 system. This uses an old - method to read the NUMA configuration directly from the builtin - Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA - instead, which also takes priority if both are compiled in. + Enable AMD NUMA node topology detection. You should say Y here if + you have a multi processor AMD system. This uses an old method to + read the NUMA configuration directly from the builtin Northbridge + of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, + which also takes priority if both are compiled in. config X86_64_ACPI_NUMA def_bool y diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index c8517f8..35b17a8 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,33 +3,33 @@ #include -extern struct pci_device_id k8_nb_ids[]; +extern struct pci_device_id amd_nb_ids[]; struct bootnode; -extern int early_is_k8_nb(u32 value); -extern int cache_k8_northbridges(void); -extern void k8_flush_garts(void); -extern int k8_get_nodes(struct bootnode *nodes); -extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); -extern int k8_scan_nodes(void); +extern int early_is_amd_nb(u32 value); +extern int cache_amd_northbridges(void); +extern void amd_flush_garts(void); +extern int amd_get_nodes(struct bootnode *nodes); +extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int amd_scan_nodes(void); -struct k8_northbridge_info { +struct amd_northbridge_info { u16 num; u8 gart_supported; struct pci_dev **nb_misc; }; -extern struct k8_northbridge_info k8_northbridges; +extern struct amd_northbridge_info amd_northbridges; #ifdef CONFIG_AMD_NB -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline struct pci_dev *node_to_amd_nb_misc(int node) { - return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; + return (node < amd_northbridges.num) ? amd_northbridges.nb_misc[node] : NULL; } #else -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline struct pci_dev *node_to_amd_nb_misc(int node) { return NULL; } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 8f6463d..c46df40 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,95 +12,95 @@ static u32 *flush_words; -struct pci_device_id k8_nb_ids[] = { +struct pci_device_id amd_nb_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; -EXPORT_SYMBOL(k8_nb_ids); +EXPORT_SYMBOL(amd_nb_ids); -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +static struct pci_dev *next_amd_northbridge(struct pci_dev *dev) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); if (!dev) break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); + } while (!pci_match_id(&amd_nb_ids[0], dev)); return dev; } -int cache_k8_northbridges(void) +int cache_amd_northbridges(void) { int i; struct pci_dev *dev; - if (k8_northbridges.num) + if (amd_northbridges.num) return 0; dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; + while ((dev = next_amd_northbridge(dev)) != NULL) + amd_northbridges.num++; /* some CPU families (e.g. 
family 0x11) do not support GART */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x15) - k8_northbridges.gart_supported = 1; + amd_northbridges.gart_supported = 1; - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * + amd_northbridges.nb_misc = kmalloc((amd_northbridges.num + 1) * sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) + if (!amd_northbridges.nb_misc) return -ENOMEM; - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; + if (!amd_northbridges.num) { + amd_northbridges.nb_misc[0] = NULL; return 0; } - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), + if (amd_northbridges.gart_supported) { + flush_words = kmalloc(amd_northbridges.num * sizeof(u32), GFP_KERNEL); if (!flush_words) { - kfree(k8_northbridges.nb_misc); + kfree(amd_northbridges.nb_misc); return -ENOMEM; } } dev = NULL; i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) + while ((dev = next_amd_northbridge(dev)) != NULL) { + amd_northbridges.nb_misc[i] = dev; + if (amd_northbridges.gart_supported) pci_read_config_dword(dev, 0x9c, &flush_words[i++]); } - k8_northbridges.nb_misc[i] = NULL; + amd_northbridges.nb_misc[i] = NULL; return 0; } -EXPORT_SYMBOL_GPL(cache_k8_northbridges); +EXPORT_SYMBOL_GPL(cache_amd_northbridges); /* Ignores subdevice/subvendor but as far as I can figure out they're useless anyways */ -int __init early_is_k8_nb(u32 device) +int __init early_is_amd_nb(u32 device) { struct pci_device_id *id; u32 vendor = device & 0xffff; device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) + for (id = amd_nb_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return 1; return 0; } -void k8_flush_garts(void) +void amd_flush_garts(void) { int flushed, i; unsigned long flags; static DEFINE_SPINLOCK(gart_lock); - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return; /* Avoid races between AGP and IOMMU. In theory it's not needed @@ -109,16 +109,16 @@ void k8_flush_garts(void) that it doesn't matter to serialize more. 
-AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, + for (i = 0; i < amd_northbridges.num; i++) { + pci_write_config_dword(amd_northbridges.nb_misc[i], 0x9c, flush_words[i]|1); flushed++; } - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], + pci_read_config_dword(amd_northbridges.nb_misc[i], 0x9c, &w); if (!(w & 1)) break; @@ -129,19 +129,19 @@ void k8_flush_garts(void) if (!flushed) printk("nothing to flush?\n"); } -EXPORT_SYMBOL_GPL(k8_flush_garts); +EXPORT_SYMBOL_GPL(amd_flush_garts); -static __init int init_k8_nbs(void) +static __init int init_amd_nbs(void) { int err = 0; - err = cache_k8_northbridges(); + err = cache_amd_northbridges(); if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); return err; } /* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b3a16e8..dcd7c83 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * Do an PCI bus scan by hand because we're running before the PCI * subsystem. * - * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan * generically. It's probably overkill to always scan all slots because * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. 
@@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; iommu_detected = 1; @@ -518,7 +518,7 @@ out: dev_base = bus_dev_ranges[i].dev_base; dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 17ad033..92512ed 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -333,7 +333,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) { struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_k8_nb_misc(node); + struct pci_dev *dev = node_to_amd_nb_misc(node); l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); if (!l3) { @@ -370,7 +370,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, return; /* not in virtualized environments */ - if (k8_northbridges.num == 0) + if (amd_northbridges.num == 0) return; /* @@ -378,7 +378,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. 
*/ if (!l3_caches) { - int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); + int size = amd_northbridges.num * sizeof(struct amd_l3_cache *); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba0f0ca..63317c5 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -143,7 +143,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { - k8_flush_garts(); + amd_flush_garts(); need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -561,17 +561,17 @@ static void enable_gart_translations(void) { int i; - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_northbridges.num; i++) { + struct pci_dev *dev = amd_northbridges.nb_misc[i]; enable_gart_translation(dev, __pa(agp_gatt_table)); } /* Flush the GART-TLB to remove stale entries */ - k8_flush_garts(); + amd_flush_garts(); } /* @@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return; pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_northbridges.num; i++) { + struct pci_dev *dev = amd_northbridges.nb_misc[i]; /* * Don't enable translations just yet. That is the next @@ -644,7 +644,7 @@ static struct sys_device device_gart = { * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. */ -static __init int init_k8_gatt(struct agp_kern_info *info) +static __init int init_amd_gatt(struct agp_kern_info *info) { unsigned aper_size, gatt_size, new_aper_size; unsigned aper_base, new_aper_base; @@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < k8_northbridges.num; i++) { - dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_northbridges.num; i++) { + dev = amd_northbridges.nb_misc[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return; - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_northbridges.num; i++) { u32 ctl; - dev = k8_northbridges.nb_misc[i]; + dev = amd_northbridges.nb_misc[i]; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -749,14 +749,14 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. 
*/ - /* Add other K8 AGP bridge drivers here */ + /* Add other AMD AGP bridge drivers here */ no_agp = no_agp || (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); @@ -765,7 +765,7 @@ int __init gart_iommu_init(void) if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { + (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); pr_warning("falling back to iommu=soft.\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 21c6746..df172c1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -694,7 +694,7 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { int acpi = 0; - int k8 = 0; + int amd = 0; unsigned long flags; #ifdef CONFIG_X86_32 @@ -981,12 +981,12 @@ void __init setup_arch(char **cmdline_p) acpi = acpi_numa_init(); #endif -#ifdef CONFIG_K8_NUMA +#ifdef CONFIG_AMD_NUMA if (!acpi) - k8 = !k8_numa_init(0, max_pfn); + amd = !amd_numa_init(0, max_pfn); #endif - initmem_init(0, max_pfn, acpi, k8); + initmem_init(0, max_pfn, acpi, amd); memblock_find_dma_reserve(); dma32_reserve_bootmem(); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 5554339..09df2f9 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c new file mode 100644 index 0000000..51fae9c --- /dev/null +++ b/arch/x86/mm/amdtopology_64.c @@ -0,0 +1,237 @@ +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct bootnode __initdata nodes[8]; +static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -1; +} + +static __init void early_get_boot_cpu_id(void) +{ + /* + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in amd_scan_nodes() + */ +#ifdef CONFIG_X86_MPPARSE + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); +#endif + early_init_lapic_mapping(); +} + +int __init amd_get_nodes(struct bootnode *physnodes) +{ + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + +int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long start = PFN_PHYS(start_pfn); + unsigned long end = PFN_PHYS(end_pfn); + unsigned numnodes; + unsigned long prevbase; + int i, nb, found = 0; + u32 nodeid, reg; + + if (!early_pci_allowed()) + return -1; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -1; + + pr_info("Number of physical nodes %d\n", numnodes); + + prevbase = 0; + for (i = 0; i < 8; i++) { + unsigned long base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + pr_info("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + pr_info("Skipping node entry %d (base %lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + pr_err("Node %d using interleaving mode %lx/%lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); + return -1; + } + if (node_isset(nodeid, nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + limit++; + + if (limit > end) + limit = end; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + pr_err("Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + pr_err("Node %d bogus settings %lx-%lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. 
*/ + if (prevbase > base) { + pr_err("Node map not sorted %lx,%lx\n", + prevbase, base); + return -1; + } + + pr_info("Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + + found++; + + nodes[nodeid].start = base; + nodes[nodeid].end = limit; + + prevbase = base; + + node_set(nodeid, nodes_parsed); + } + + if (!found) + return -1; + return 0; +} + +int __init amd_scan_nodes(void) +{ + unsigned int bits; + unsigned int cores; + unsigned int apicid_base; + int i; + + BUG_ON(nodes_empty(nodes_parsed)); + node_possible_map = nodes_parsed; + memnode_shift = compute_hash_shift(nodes, 8, NULL); + if (memnode_shift < 0) { + pr_err("No NUMA node hash function found. Contact maintainer\n"); + return -1; + } + pr_info("Using node hash shift of %d\n", memnode_shift); + + /* use the coreid bits from early_identify_cpu */ + bits = boot_cpu_data.x86_coreid_bits; + cores = (1<<bits); + apicid_base = 0; + /* get the APIC ID of the BSP early for systems with apicid lifting */ + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, node_possible_map) { + int j; + + memblock_x86_register_active_regions(i, + nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + for (j = apicid_base; j < cores + apicid_base; j++) + apicid_to_node[(i << bits) + j] = i; + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + + numa_init_array(); + return 0; +} diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c deleted file mode 100644 index 804a3b6..0000000 --- a/arch/x86/mm/k8topology_64.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * AMD K8 NUMA support. - * Discover the memory map and associated nodes. - * - * This version reads it directly from the K8 northbridge. - * - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct bootnode __initdata nodes[8]; -static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; - -static __init int find_northbridge(void) -{ - int num; - - for (num = 0; num < 32; num++) { - u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) - continue; - return num; - } - - return -1; -} - -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in k8_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif - early_init_lapic_mapping(); } - -int __init k8_get_nodes(struct bootnode *physnodes) -{ - int i; - int ret = 0; - - for_each_node_mask(i, nodes_parsed) { - physnodes[ret].start = nodes[i].start; - physnodes[ret].end = nodes[i].end; - ret++; - } - return ret; -} - -int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long start = PFN_PHYS(start_pfn); - unsigned long end = PFN_PHYS(end_pfn); - unsigned numnodes; - unsigned long prevbase; - int i, nb, found = 0; - u32 nodeid, reg; - - if (!early_pci_allowed()) - return -1; - - nb = find_northbridge(); - if (nb < 0) - return nb; - - pr_info("Scanning NUMA topology in
Northbridge %d\n", nb); - - reg = read_pci_config(0, nb, 0, 0x60); - numnodes = ((reg >> 4) & 0xF) + 1; - if (numnodes <= 1) - return -1; - - pr_info("Number of physical nodes %d\n", numnodes); - - prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base, limit; - - base = read_pci_config(0, nb, 1, 0x40 + i*8); - limit = read_pci_config(0, nb, 1, 0x44 + i*8); - - nodeid = limit & 7; - if ((base & 3) == 0) { - if (i < numnodes) - pr_info("Skipping disabled node %d\n", i); - continue; - } - if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); - continue; - } - - if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", - i, base); - continue; - } - if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", - nodeid, (base >> 8) & 3, (limit >> 8) & 3); - return -1; - } - if (node_isset(nodeid, nodes_parsed)) { - pr_info("Node %d already present, skipping\n", - nodeid); - continue; - } - - limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; - limit++; - - if (limit > end) - limit = end; - if (limit <= base) - continue; - - base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - pr_err("Empty node %d\n", nodeid); - continue; - } - if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); - continue; - } - - /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", - prevbase, base); - return -1; - } - - pr_info("Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - - found++; - - nodes[nodeid].start = base; - nodes[nodeid].end = limit; - - prevbase = base; - - node_set(nodeid, nodes_parsed); - } - - if (!found) - return -1; - return 0; -} - -int __init k8_scan_nodes(void) -{ - unsigned int bits; - unsigned int cores; - unsigned int apicid_base; - int i; - - BUG_ON(nodes_empty(nodes_parsed)); - node_possible_map = nodes_parsed; - memnode_shift = compute_hash_shift(nodes, 8, NULL); - if (memnode_shift < 0) { - pr_err("No NUMA node hash function found. 
Contact maintainer\n"); - return -1; - } - pr_info("Using node hash shift of %d\n", memnode_shift); - - /* use the coreid bits from early_identify_cpu */ - bits = boot_cpu_data.x86_coreid_bits; - cores = (1<<bits); - apicid_base = 0; - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; - } - - for_each_node_mask(i, node_possible_map) { - int j; - - memblock_x86_register_active_regions(i, - nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - for (j = apicid_base; j < cores + apicid_base; j++) - apicid_to_node[(i << bits) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - - numa_init_array(); - return 0; -} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7ffc9b7..7762a51 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int k8) + int acpi, int amd) { int nr_nodes = 0; int ret = 0; @@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, if (acpi) nr_nodes = acpi_get_nodes(physnodes); #endif -#ifdef CONFIG_K8_NUMA - if (k8) - nr_nodes = k8_get_nodes(physnodes); +#ifdef CONFIG_AMD_NUMA + if (amd) + nr_nodes = amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or K8 incorrectly reported the topology or the mem= + * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ for (i = 0; i < nr_nodes; i++) { @@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * numa=fake command-line option. */ static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int k8) + unsigned long last_pfn, int acpi, int amd) { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; @@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn, int num_nodes; int i; - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size.
Otherwise, if it is just a single number N, @@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn, #endif /* CONFIG_NUMA_EMU */ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int k8) + int acpi, int amd) { int i; @@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #endif -#ifdef CONFIG_K8_NUMA - if (!numa_off && k8 && !k8_scan_nodes()) +#ifdef CONFIG_AMD_NUMA + if (!numa_off && amd && !amd_scan_nodes()) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 42396df..b1f8bb5 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -38,7 +38,7 @@ static int agp_bridges_found; static void amd64_tlbflush(struct agp_memory *temp) { - k8_flush_garts(); + amd_flush_garts(); } static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) @@ -124,7 +124,7 @@ static int amd64_fetch_size(void) u32 temp; struct aper_size_info_32 *values; - dev = k8_northbridges.nb_misc[0]; + dev = amd_northbridges.nb_misc[0]; if (dev==NULL) return 0; @@ -181,16 +181,16 @@ static int amd_8151_configure(void) unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return 0; /* Configure AGP regs in each x86-64 host bridge. 
*/ - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_northbridges.num; i++) { agp_bridge->gart_bus_addr = - amd64_configure(k8_northbridges.nb_misc[i], + amd64_configure(amd_northbridges.nb_misc[i], gatt_bus); } - k8_flush_garts(); + amd_flush_garts(); return 0; } @@ -200,11 +200,11 @@ static void amd64_cleanup(void) u32 tmp; int i; - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_northbridges.num; i++) { + struct pci_dev *dev = amd_northbridges.nb_misc[i]; /* disable gart translation */ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); tmp &= ~GARTEN; @@ -331,15 +331,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr) { int i; - if (cache_k8_northbridges() < 0) + if (cache_amd_northbridges() < 0) return -ENODEV; - if (!k8_northbridges.gart_supported) + if (!amd_northbridges.gart_supported) return -ENODEV; i = 0; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_northbridges.num; i++) { + struct pci_dev *dev = amd_northbridges.nb_misc[i]; if (fix_northbridge(dev, pdev, cap_ptr) < 0) { dev_err(&dev->dev, "no usable aperture found\n"); #ifdef __x86_64__ @@ -416,7 +416,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev) } /* shadow x86-64 registers into ULi registers */ - pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + pci_read_config_dword (amd_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, &httfea); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -484,7 +484,7 @@ static int nforce3_agp_init(struct pci_dev *pdev) pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp); /* shadow x86-64 registers into NVIDIA registers */ - pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + pci_read_config_dword (amd_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, &apbase); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -778,7 +778,7 @@ int __init agp_amd64_init(void) } /* First check that we have at least one AMD64 NB */ - if (!pci_dev_present(k8_nb_ids)) + if (!pci_dev_present(amd_nb_ids)) return -ENODEV; /* Look for any AGP bridge */ diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 8521401..8b144cc 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void) opstate_init(); - if (cache_k8_northbridges() < 0) + if (cache_amd_northbridges() < 0) goto err_ret; msrs = msrs_alloc(); @@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void) * to finish initialization of the MC instances. */ err = -ENODEV; - for (nb = 0; nb < k8_northbridges.num; nb++) { + for (nb = 0; nb < amd_northbridges.num; nb++) { if (!pvt_lookup[nb]) continue; -- cgit v0.10.2 From 9653a5c76c8677b05b45b3b999d3b39988d2a064 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Fri, 29 Oct 2010 17:14:31 +0200 Subject: x86, amd-nb: Cleanup AMD northbridge caching code Support more than just the "Misc Control" part of the northbridges. Support more flags by turning "gart_supported" into a single bit flag that is stored in a flags member. Clean up related code by using a set of functions (amd_nb_num(), amd_nb_has_feature() and node_to_amd_nb()) instead of accessing the NB data structures directly. Reorder the initialization code and put the GART flush words caching in a separate function. 
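As a quick illustration of the calling convention this cleanup establishes (nb_callback() below is a stand-in used only for this sketch, not part of the patch; the real helpers appear in the diff that follows), a caller that used to test and index the bookkeeping structure directly now goes through the accessors:

	/* old style: direct access to the NB bookkeeping */
	if (!amd_northbridges.gart_supported)
		return;
	for (i = 0; i < amd_northbridges.num; i++)
		nb_callback(amd_northbridges.nb_misc[i]);

	/* new style: feature bit plus accessor functions */
	if (!amd_nb_has_feature(AMD_NB_GART))
		return;
	for (i = 0; i < amd_nb_num(); i++)
		nb_callback(node_to_amd_nb(i)->misc);

Keeping features as bits in one flags word lets later capabilities (an L3 feature, for instance) be added without growing the structure by one bool each time.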
Signed-off-by: Hans Rosenfeld Signed-off-by: Borislav Petkov diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 35b17a8..4d7ec7d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,36 +3,52 @@ #include -extern struct pci_device_id amd_nb_ids[]; +extern struct pci_device_id amd_nb_misc_ids[]; struct bootnode; extern int early_is_amd_nb(u32 value); -extern int cache_amd_northbridges(void); +extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_get_nodes(struct bootnode *nodes); extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int amd_scan_nodes(void); +struct amd_northbridge { + struct pci_dev *misc; +}; + struct amd_northbridge_info { u16 num; - u8 gart_supported; - struct pci_dev **nb_misc; + u64 flags; + struct amd_northbridge *nb; }; extern struct amd_northbridge_info amd_northbridges; +#define AMD_NB_GART 0x1 + #ifdef CONFIG_AMD_NB -static inline struct pci_dev *node_to_amd_nb_misc(int node) +static inline int amd_nb_num(void) { - return (node < amd_northbridges.num) ? amd_northbridges.nb_misc[node] : NULL; + return amd_northbridges.num; } -#else +static inline int amd_nb_has_feature(int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} -static inline struct pci_dev *node_to_amd_nb_misc(int node) +static inline struct amd_northbridge *node_to_amd_nb(int node) { - return NULL; + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; } + +#else + +#define amd_nb_num(x) 0 +#define amd_nb_has_feature(x) false +#define node_to_amd_nb(x) NULL + #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index c46df40..63c8b4f 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,74 +12,65 @@ static u32 *flush_words; -struct pci_device_id amd_nb_ids[] = { +struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; -EXPORT_SYMBOL(amd_nb_ids); +EXPORT_SYMBOL(amd_nb_misc_ids); struct amd_northbridge_info amd_northbridges; EXPORT_SYMBOL(amd_northbridges); -static struct pci_dev *next_amd_northbridge(struct pci_dev *dev) +static struct pci_dev *next_northbridge(struct pci_dev *dev, + struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); if (!dev) break; - } while (!pci_match_id(&amd_nb_ids[0], dev)); + } while (!pci_match_id(ids, dev)); return dev; } -int cache_amd_northbridges(void) +int amd_cache_northbridges(void) { - int i; - struct pci_dev *dev; + int i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc; - if (amd_northbridges.num) + if (amd_nb_num()) return 0; - dev = NULL; - while ((dev = next_amd_northbridge(dev)) != NULL) - amd_northbridges.num++; + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; - /* some CPU families (e.g. 
family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) - amd_northbridges.gart_supported = 1; + if (i == 0) + return 0; - amd_northbridges.nb_misc = kmalloc((amd_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!amd_northbridges.nb_misc) + nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) return -ENOMEM; - if (!amd_northbridges.num) { - amd_northbridges.nb_misc[0] = NULL; - return 0; - } + amd_northbridges.nb = nb; + amd_northbridges.num = i; - if (amd_northbridges.gart_supported) { - flush_words = kmalloc(amd_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(amd_northbridges.nb_misc); - return -ENOMEM; - } - } + misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + } + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_GART; - dev = NULL; - i = 0; - while ((dev = next_amd_northbridge(dev)) != NULL) { - amd_northbridges.nb_misc[i] = dev; - if (amd_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - amd_northbridges.nb_misc[i] = NULL; return 0; } -EXPORT_SYMBOL_GPL(cache_amd_northbridges); +EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* Ignores subdevice/subvendor but as far as I can figure out they're useless anyways */ @@ -88,19 +79,39 @@ int __init early_is_amd_nb(u32 device) struct pci_device_id *id; u32 vendor = device & 0xffff; device >>= 16; - for (id = amd_nb_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return 1; return 0; } +int amd_cache_gart(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + void amd_flush_garts(void) { int flushed, i; unsigned long flags; static DEFINE_SPINLOCK(gart_lock); - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; /* Avoid races between AGP and IOMMU. In theory it's not needed @@ -109,16 +120,16 @@ void amd_flush_garts(void) that it doesn't matter to serialize more. 
-AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < amd_northbridges.num; i++) { - pci_write_config_dword(amd_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); flushed++; } - for (i = 0; i < amd_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(amd_northbridges.nb_misc[i], + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w); if (!(w & 1)) break; @@ -135,11 +146,15 @@ static __init int init_amd_nbs(void) { int err = 0; - err = cache_amd_northbridges(); + err = amd_cache_northbridges(); if (err < 0) printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + if (amd_cache_gart() < 0) + printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " + "GART support disabled.\n"); + return err; } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 92512ed..6b8ea74 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -333,7 +333,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) { struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_amd_nb_misc(node); + struct pci_dev *dev = node_to_amd_nb(node)->misc; l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); if (!l3) { @@ -370,7 +370,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, return; /* not in virtualized environments */ - if (amd_northbridges.num == 0) + if (amd_nb_num() == 0) return; /* @@ -378,7 +378,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = amd_northbridges.num * sizeof(struct amd_l3_cache *); + int size = amd_nb_num() * sizeof(struct amd_l3_cache *); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 63317c5..c01ffa5 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -561,11 +561,11 @@ static void enable_gart_translations(void) { int i; - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < amd_northbridges.num; i++) { - struct pci_dev *dev = amd_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; enable_gart_translation(dev, __pa(agp_gatt_table)); } @@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < amd_northbridges.num; i++) { - struct pci_dev *dev = amd_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* * Don't enable translations just yet. 
That is the next @@ -656,8 +656,8 @@ static __init int init_amd_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < amd_northbridges.num; i++) { - dev = amd_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < amd_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 ctl; - dev = amd_northbridges.nb_misc[i]; + dev = node_to_amd_nb(i)->misc; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -749,7 +749,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; #ifndef CONFIG_AGP_AMD64 diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index b1f8bb5..9252e85 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -124,7 +124,7 @@ static int amd64_fetch_size(void) u32 temp; struct aper_size_info_32 *values; - dev = amd_northbridges.nb_misc[0]; + dev = node_to_amd_nb(0)->misc; if (dev==NULL) return 0; @@ -181,14 +181,13 @@ static int amd_8151_configure(void) unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; /* Configure AGP regs in each x86-64 host bridge. */ - for (i = 0; i < amd_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { agp_bridge->gart_bus_addr = - amd64_configure(amd_northbridges.nb_misc[i], - gatt_bus); + amd64_configure(node_to_amd_nb(i)->misc, gatt_bus); } amd_flush_garts(); return 0; @@ -200,11 +199,11 @@ static void amd64_cleanup(void) u32 tmp; int i; - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < amd_northbridges.num; i++) { - struct pci_dev *dev = amd_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* disable gart translation */ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); tmp &= ~GARTEN; @@ -331,15 +330,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr) { int i; - if (cache_amd_northbridges() < 0) + if (amd_cache_northbridges() < 0) return -ENODEV; - if (!amd_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return -ENODEV; i = 0; - for (i = 0; i < amd_northbridges.num; i++) { - struct pci_dev *dev = amd_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; if (fix_northbridge(dev, pdev, cap_ptr) < 0) { dev_err(&dev->dev, "no usable aperture found\n"); #ifdef __x86_64__ @@ -416,7 +415,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev) } /* shadow x86-64 registers into ULi registers */ - pci_read_config_dword (amd_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE, &httfea); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -484,7 +483,7 @@ static int nforce3_agp_init(struct pci_dev *pdev) pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp); /* shadow x86-64 registers into NVIDIA registers */ - pci_read_config_dword (amd_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + 
pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE, &apbase); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -778,7 +777,7 @@ int __init agp_amd64_init(void) } /* First check that we have at least one AMD64 NB */ - if (!pci_dev_present(amd_nb_ids)) + if (!pci_dev_present(amd_nb_misc_ids)) return -ENODEV; /* Look for any AGP bridge */ diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 8b144cc..774f950 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void) opstate_init(); - if (cache_amd_northbridges() < 0) + if (amd_cache_northbridges() < 0) goto err_ret; msrs = msrs_alloc(); @@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void) * to finish initialization of the MC instances. */ err = -ENODEV; - for (nb = 0; nb < amd_northbridges.num; nb++) { + for (nb = 0; nb < amd_nb_num(); nb++) { if (!pvt_lookup[nb]) continue; -- cgit v0.10.2 From f658bcfb2607bf0808966a69cf74135ce98e5c2d Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Fri, 29 Oct 2010 17:14:32 +0200 Subject: x86, cacheinfo: Cleanup L3 cache index disable support Adaptations to the changes of the AMD northbridge caching code: instead of a bool in each l3 struct, use a flag in amd_northbridges.flags to indicate L3 cache index disable support; use a pointer to the whole northbridge instead of the misc device in the l3 struct; simplify the initialisation; dynamically generate sysfs attribute array. Signed-off-by: Hans Rosenfeld Signed-off-by: Borislav Petkov diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 4d7ec7d..6aee50d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -25,6 +25,7 @@ struct amd_northbridge_info { extern struct amd_northbridge_info amd_northbridges; #define AMD_NB_GART 0x1 +#define AMD_NB_L3_INDEX_DISABLE 0x2 #ifdef CONFIG_AMD_NB diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 63c8b4f..affacb5 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -68,6 +68,16 @@ int amd_cache_northbridges(void) boot_cpu_data.x86 == 0x15) amd_northbridges.flags |= AMD_NB_GART; + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10.
+ */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + return 0; } EXPORT_SYMBOL_GPL(amd_cache_northbridges); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b8ea74..9ecf81f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx { }; struct amd_l3_cache { - struct pci_dev *dev; - bool can_disable; + struct amd_northbridge *nb; unsigned indices; u8 subcaches[4]; }; @@ -311,14 +310,12 @@ struct _cache_attr { /* * L3 cache descriptors */ -static struct amd_l3_cache **__cpuinitdata l3_caches; - static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->dev, 0x1C4, &val); + pci_read_config_dword(l3->nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -{ - struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_amd_nb(node)->misc; - - l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); - if (!l3) { - printk(KERN_WARNING "Error allocating L3 struct\n"); - return NULL; - } - - l3->dev = dev; - - amd_calc_l3_indices(l3); - - return l3; -} - -static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, + int index) { + static struct amd_l3_cache *__cpuinitdata l3_caches; int node; - if (boot_cpu_data.x86 != 0x10) - return; - - if (index < 3) - return; - - /* see errata #382 and #388 */ - if (boot_cpu_data.x86_model < 0x8) - return; - - if ((boot_cpu_data.x86_model == 0x8 || - boot_cpu_data.x86_model == 0x9) - && - boot_cpu_data.x86_mask < 0x1) - return; - - /* not in virtualized environments */ - if (amd_nb_num() == 0) + /* only for L3, and not in virtualized environments */ + if (index < 3 || amd_nb_num() == 0) return; /* @@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. 
*/ if (!l3_caches) { - int size = amd_nb_num() * sizeof(struct amd_l3_cache *); + int size = amd_nb_num() * sizeof(struct amd_l3_cache); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -387,14 +351,12 @@ node = amd_get_nb_id(smp_processor_id()); - if (!l3_caches[node]) { - l3_caches[node] = amd_init_l3_cache(node); - l3_caches[node]->can_disable = true; + if (!l3_caches[node].nb) { + l3_caches[node].nb = node_to_amd_nb(node); + amd_calc_l3_indices(&l3_caches[node]); } - WARN_ON(!l3_caches[node]); - - this_leaf->l3 = l3_caches[node]; + this_leaf->l3 = &l3_caches[node]; } /* @@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg); + pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; index = amd_get_l3_disable_slot(this_leaf->l3, slot); @@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, if (!l3->subcaches[i]) continue; - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); } } @@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); #else /* CONFIG_AMD_NB */ -static void __cpuinit -amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) -{ -}; +#define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ static int @@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(this_leaf, index); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -983,30 +944,48 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -#define DEFAULT_SYSFS_CACHE_ATTRS \ - &type.attr, \ - &level.attr, \ - &coherency_line_size.attr, \ - &physical_line_partition.attr, \ - &ways_of_associativity.attr, \ - &number_of_sets.attr, \ - &size.attr, \ - &shared_cpu_map.attr, \ - &shared_cpu_list.attr - static struct attribute *default_attrs[] = {
- DEFAULT_SYSFS_CACHE_ATTRS, + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, NULL }; -static struct attribute *default_l3_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, #ifdef CONFIG_AMD_NB - &cache_disable_0.attr, - &cache_disable_1.attr, +static struct attribute ** __cpuinit amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = sizeof (default_attrs) / sizeof (struct attribute *); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + return attrs; +} #endif - NULL -}; static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->l3 && this_leaf->l3->can_disable) - ktype_cache.default_attrs = default_l3_attrs; - else - ktype_cache.default_attrs = default_attrs; - + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->l3) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), -- cgit v0.10.2 From 5ca9afdb9f6a5267927b54de3f42c756e8af7fcd Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Thu, 18 Nov 2010 21:16:45 +0300 Subject: x86, mrst: Check platform_device_register() return code platform_device_register() may fail, if so propagate the return code from mrst_device_create(). Signed-off-by: Vasiliy Kulikov LKML-Reference: <1290104207-31279-1-git-send-email-segoon@openwall.com> Acked-by: Alan Cox Signed-off-by: H. Peter Anvin diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 4d3f770..32cd7ed 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -159,8 +159,7 @@ static int __init mrst_device_create(void) vrtc_resources[1].start = sfi_mrtc_array[0].irq; vrtc_resources[1].end = sfi_mrtc_array[0].irq; - platform_device_register(&vrtc_device); - return 0; + return platform_device_register(&vrtc_device); } module_init(mrst_device_create); -- cgit v0.10.2 From 9cdca869724e766eb48c061967cb777ddb436c76 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 20 Nov 2010 10:37:05 +0100 Subject: x86: platform: Move iris to x86/platform where it belongs Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index beac17a..9e13763 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -109,7 +109,6 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o -obj-$(CONFIG_X86_32_IRIS) += iris.o ### # 64 bit specific files diff --git a/arch/x86/kernel/iris.c b/arch/x86/kernel/iris.c deleted file mode 100644 index 1ba7f5e..0000000 --- a/arch/x86/kernel/iris.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Eurobraille/Iris power off support. - * - * Eurobraille's Iris machine is a PC with no APM or ACPI support. - * It is shutdown by a special I/O sequence which this module provides. 
- * - * Copyright (C) Shérab - * - * This program is free software ; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation ; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY ; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with the program ; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define IRIS_GIO_BASE 0x340 -#define IRIS_GIO_INPUT IRIS_GIO_BASE -#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1) -#define IRIS_GIO_PULSE 0x80 /* First byte to send */ -#define IRIS_GIO_REST 0x00 /* Second byte to send */ -#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */ - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Sébastien Hinderer "); -MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); -MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); - -static int force; - -module_param(force, bool, 0); -MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); - -static void (*old_pm_power_off)(void); - -static void iris_power_off(void) -{ - outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT); - msleep(850); - outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT); -} - -/* - * Before installing the power_off handler, try to make sure the OS is - * running on an Iris. Since Iris does not support DMI, this is done - * by reading its input port and seeing whether the read value is - * meaningful. - */ -static int iris_init(void) -{ - unsigned char status; - if (force != 1) { - printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n"); - return -ENODEV; - } - status = inb(IRIS_GIO_INPUT); - if (status == IRIS_GIO_NODEV) { - printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); - return -ENODEV; - } - old_pm_power_off = pm_power_off; - pm_power_off = &iris_power_off; - printk(KERN_INFO "Iris power_off handler installed.\n"); - - return 0; -} - -static void iris_exit(void) -{ - pm_power_off = old_pm_power_off; - printk(KERN_INFO "Iris power_off handler uninstalled.\n"); -} - -module_init(iris_init); -module_exit(iris_exit); diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index a019bc3..021eee9 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,6 +1,7 @@ # Platform specific code goes here obj-y += ce4100/ obj-y += efi/ +obj-y += iris/ obj-y += mrst/ obj-y += olpc/ obj-y += scx200/ diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile new file mode 100644 index 0000000..db92198 --- /dev/null +++ b/arch/x86/platform/iris/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_32_IRIS) += iris.o diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c new file mode 100644 index 0000000..1ba7f5e --- /dev/null +++ b/arch/x86/platform/iris/iris.c @@ -0,0 +1,91 @@ +/* + * Eurobraille/Iris power off support. + * + * Eurobraille's Iris machine is a PC with no APM or ACPI support. + * It is shutdown by a special I/O sequence which this module provides. 
+ * + * Copyright (C) Shérab + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRIS_GIO_BASE 0x340 +#define IRIS_GIO_INPUT IRIS_GIO_BASE +#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1) +#define IRIS_GIO_PULSE 0x80 /* First byte to send */ +#define IRIS_GIO_REST 0x00 /* Second byte to send */ +#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sébastien Hinderer "); +MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); +MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); + +static int force; + +module_param(force, bool, 0); +MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); + +static void (*old_pm_power_off)(void); + +static void iris_power_off(void) +{ + outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT); + msleep(850); + outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT); +} + +/* + * Before installing the power_off handler, try to make sure the OS is + * running on an Iris. Since Iris does not support DMI, this is done + * by reading its input port and seeing whether the read value is + * meaningful. + */ +static int iris_init(void) +{ + unsigned char status; + if (force != 1) { + printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n"); + return -ENODEV; + } + status = inb(IRIS_GIO_INPUT); + if (status == IRIS_GIO_NODEV) { + printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); + return -ENODEV; + } + old_pm_power_off = pm_power_off; + pm_power_off = &iris_power_off; + printk(KERN_INFO "Iris power_off handler installed.\n"); + + return 0; +} + +static void iris_exit(void) +{ + pm_power_off = old_pm_power_off; + printk(KERN_INFO "Iris power_off handler uninstalled.\n"); +} + +module_init(iris_init); +module_exit(iris_exit); -- cgit v0.10.2 From 691513f70d3957939a318da970987b876c720861 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 22 Nov 2010 14:03:28 +0100 Subject: x86: Resume trampoline must be executable commit 5bd5a452(x86: Add NX protection for kernel data) marked the trampoline area NX - which unsurprisingly breaks resume and cpu hotplug. Revert the portion of that commit, which touches the trampoline. 
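One API note on the hunk below: set_memory_nx() takes a start address and a count in pages, hence the >> PAGE_SHIFT on the byte span. The revert only changes the upper bound of that span; both variants side by side, as taken from the - and + lines of the hunk (illustrative excerpt, not the whole function):

	/* span introduced by 5bd5a452, reaching up to kernel_end */
	set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT);

	/* reverted span, ending at the end of the rodata hugepage area */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);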
Originally-from: Lin Ming LKML-Reference: <1290410581.2405.24.camel@minggr.sh.intel.com> Cc: Matthieu Castet Cc: Siarhei Liakh Cc: Xuxian Jiang Cc: Ingo Molnar Cc: Arjan van de Ven Cc: Andi Kleen Tested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ce59c05..71a5929 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -788,7 +788,6 @@ void mark_rodata_ro(void) unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK); unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); unsigned long data_start = (unsigned long) &_sdata; @@ -803,7 +802,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); rodata_test(); -- cgit v0.10.2 From 28d0686cf7b14e30243096bd874d3f80591ed392 Mon Sep 17 00:00:00 2001 From: Erik Gilling Date: Fri, 19 Nov 2010 18:08:51 -0800 Subject: sched: Make task dump print all 15 chars of proc comm Signed-off-by: Erik Gilling Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra LKML-Reference: <1290218934-8544-3-git-send-email-john.stultz@linaro.org> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 550cf3a..324afce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5249,7 +5249,7 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) -- cgit v0.10.2 From 70caf8a6c13c2279b35f2ad6b644815533d6c476 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 20 Nov 2010 00:53:51 +0100 Subject: sched: Fix UP build breakage The recent cgroup-scheduling rework caused a UP build problem. Cc: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 390ce30..82fd884 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -562,7 +562,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); -#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED cfs_rq->load_unacc_exec_time += delta_exec; if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); -- cgit v0.10.2 From 2e01f4740a874b6085da6ebf541e7ffde9a72bf2 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Thu, 18 Nov 2010 15:44:54 +0100 Subject: cpu: Remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC warns us about: kernel/cpu.c: In function ‘take_cpu_down’: kernel/cpu.c:200:15: warning: unused variable ‘cpu’ This variable is unused since param->hcpu is directly used later on in cpu_notify. 
Signed-off-by: Dhaval Giani
Signed-off-by: Peter Zijlstra
LKML-Reference: <1290091494.1145.5.camel@gondor.retis>
Signed-off-by: Ingo Molnar

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8615aa6..3945066 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -197,7 +197,6 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
         struct take_cpu_down_param *param = _param;
-        unsigned int cpu = (unsigned long)param->hcpu;
         int err;

         /* Ensure this CPU doesn't handle any more interrupts. */
--
cgit v0.10.2


From 51a96c77815e7f139892a6e9c8275a50e9baebdf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 19 Nov 2010 20:37:53 +0100
Subject: cpu: Remove incorrect BUG_ON

Oleg mentioned that there is no actual guarantee that the dying cpu's
migration thread has finished running when we get there, so replace the
BUG_ON() with a spinloop waiting for it.

Reported-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3945066..cb7a1ef 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -249,8 +249,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
          * The migration_call() CPU_DYING callback will have removed all
          * runnable tasks from the cpu, there's only the idle task left now
          * that the migration thread is done doing the stop_machine thing.
+         *
+         * Wait for the stop thread to go away.
          */
-        BUG_ON(!idle_cpu(cpu));
+        while (!idle_cpu(cpu))
+                cpu_relax();

         /* This actually kills the CPU. */
         __cpu_die(cpu);
--
cgit v0.10.2


From 5bb6b1ea67a73f0665a41726dd7138977b992c6c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 19 Nov 2010 21:11:09 +0100
Subject: sched: Add some clock info to sched_debug

Add more clock information to /proc/sched_debug; Thomas wanted to see
the sched_clock_stable state.
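The new output is produced with stringizing macros, so each printed label is
derived from the expression itself rather than typed twice. The same trick,
sketched in ordinary user-space C (printf stands in for SEQ_printf, and the
clock values are made up):

    #include <stdio.h>

    #define NSEC_PER_MSEC 1000000LL

    /* Label each value with the expression text itself, via #x. */
    #define P(x)  printf("%-40s: %lld\n", #x, (long long)(x))
    /* Print nanoseconds as msec.usec, mimicking SPLIT_NS(). */
    #define PN(x) printf("%-40s: %lld.%06lld\n", #x, \
                         (long long)(x) / NSEC_PER_MSEC, \
                         (long long)(x) % NSEC_PER_MSEC)

    int main(void)
    {
            long long ktime   = 123456789012LL; /* fake clock samples, in ns */
            long long cpu_clk = 123456790000LL;
            long long jiffies = 4295001234LL;   /* fake tick counter */

            PN(ktime);
            PN(cpu_clk);
            P(jiffies);
            return 0;
    }

Sampling ktime_get(), sched_clock() and local_clock() back-to-back inside one
irqs-off section, as the patch does, is what makes the printed values mutually
comparable.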
Requested-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a14..9d8af0b 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 }
 EXPORT_SYMBOL_GPL(sched_clock);

-static __read_mostly int sched_clock_running;
+__read_mostly int sched_clock_running;

 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 __read_mostly int sched_clock_stable;

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index e6590e7..e95b774 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -250,6 +250,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 #undef P
 }

+extern __read_mostly int sched_clock_running;
+
 static void print_cpu(struct seq_file *m, int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
@@ -321,21 +323,42 @@ static const char *sched_tunable_scaling_names[] = {

 static int sched_debug_show(struct seq_file *m, void *v)
 {
-        u64 now = ktime_to_ns(ktime_get());
+        u64 ktime, sched_clk, cpu_clk;
+        unsigned long flags;
         int cpu;

+        local_irq_save(flags);
+        ktime = ktime_to_ns(ktime_get());
+        sched_clk = sched_clock();
+        cpu_clk = local_clock();
+        local_irq_restore(flags);
+
-        SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+        SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
                 init_utsname()->release,
                 (int)strcspn(init_utsname()->version, " "),
                 init_utsname()->version);

-        SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+#define P(x) \
+        SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+        SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+        PN(ktime);
+        PN(sched_clk);
+        PN(cpu_clk);
+        P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+        P(sched_clock_stable);
+#endif
+#undef PN
+#undef P
+
+        SEQ_printf(m, "\n");
+        SEQ_printf(m, "sysctl_sched\n");

 #define P(x) \
         SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
 #define PN(x) \
         SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
-        P(jiffies);
         PN(sysctl_sched_latency);
         PN(sysctl_sched_min_granularity);
         PN(sysctl_sched_wakeup_granularity);
--
cgit v0.10.2


From 335d7afbfb71faac833734a94240c1e07cf0ead8 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer
Date: Mon, 22 Nov 2010 15:47:36 +0100
Subject: mutexes, sched: Introduce arch_mutex_cpu_relax()

The spinning mutex implementation uses cpu_relax() in busy loops as a
compiler barrier. Depending on the architecture, cpu_relax() may do more
than needed in these specific mutex spin loops. On System z we also give
up the time slice of the virtual cpu in cpu_relax(), which prevents
effective spinning on the mutex.

This patch replaces cpu_relax() in the spinning mutex code with
arch_mutex_cpu_relax(), which can be defined by each architecture that
selects HAVE_ARCH_MUTEX_CPU_RELAX. The default is still cpu_relax(), so
this patch should not affect architectures other than System z for now.
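The opt-in plumbing is the familiar weak-default pattern. A condensed
stand-alone sketch (a plain macro plays the role of the
HAVE_ARCH_MUTEX_CPU_RELAX Kconfig symbol, and the x86 "pause" hint is only an
example, not part of the patch):

    #include <stdio.h>

    /* Uncomment to emulate an architecture that opts in, as s390 does: */
    /* #define HAVE_ARCH_MUTEX_CPU_RELAX */

    #ifdef HAVE_ARCH_MUTEX_CPU_RELAX
    /* Arch-provided version; s390 uses a bare compiler barrier so the
     * spin loop does not give up the virtual CPU's time slice. */
    # define arch_mutex_cpu_relax() __asm__ __volatile__("" ::: "memory")
    #else
    # define arch_mutex_cpu_relax() cpu_relax() /* the default */
    #endif

    static inline void cpu_relax(void)
    {
            __asm__ __volatile__("pause" ::: "memory"); /* x86 spin-wait hint */
    }

    int main(void)
    {
            int spins;

            for (spins = 0; spins < 1000; spins++) /* owner-spin loop stand-in */
                    arch_mutex_cpu_relax();
            printf("spun %d times\n", spins);
            return 0;
    }

Because the mutex code only ever calls arch_mutex_cpu_relax(), an architecture
changes the spin behaviour simply by selecting the Kconfig symbol and
supplying its own definition, exactly what the s390 hunks below do.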
Signed-off-by: Gerald Schaefer Signed-off-by: Peter Zijlstra LKML-Reference: <1290437256.7455.4.camel@thinkpad> Signed-off-by: Ingo Molnar diff --git a/arch/Kconfig b/arch/Kconfig index 8bf0fa65..f78c2be 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI config HAVE_ARCH_JUMP_LABEL bool +config HAVE_ARCH_MUTEX_CPU_RELAX + bool + source "kernel/gcov/Kconfig" diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e0b98e7..6c6d7b3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -99,6 +99,7 @@ config S390 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_GET_USER_PAGES_FAST + select HAVE_ARCH_MUTEX_CPU_RELAX select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h index 458c1f7..688271f 100644 --- a/arch/s390/include/asm/mutex.h +++ b/arch/s390/include/asm/mutex.h @@ -7,3 +7,5 @@ */ #include + +#define arch_mutex_cpu_relax() barrier() diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f363bc8..94b48bd 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); +#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX +#define arch_mutex_cpu_relax() cpu_relax() +#endif + #endif diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c..a5889fb 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax(); + arch_mutex_cpu_relax(); } #endif spin_lock_mutex(&lock->wait_lock, flags); diff --git a/kernel/sched.c b/kernel/sched.c index 3e8a7db..abe7aec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,7 @@ #include #include +#include #include "sched_cpupri.h" #include "workqueue_sched.h" @@ -3888,7 +3889,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) if (task_thread_info(rq->curr) != owner || need_resched()) return 0; - cpu_relax(); + arch_mutex_cpu_relax(); } return 1; -- cgit v0.10.2 From b7a2b39d9b7703ccf068f549c8dc3465fc41d015 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Fri, 26 Nov 2010 12:37:09 +0530 Subject: sched: Remove unused argument dest_cpu to migrate_task() Remove unused argument, 'dest_cpu' of migrate_task(), and pass runqueue, as it is always known at the call site. Signed-off-by: Nikanth Karthikesan Signed-off-by: Peter Zijlstra LKML-Reference: <201011261237.09187.knikanth@suse.de> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index abe7aec..35a6373 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2061,10 +2061,8 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, int dest_cpu) +static bool migrate_task(struct task_struct *p, struct rq *rq) { - struct rq *rq = task_rq(p); - /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. 
@@ -3224,7 +3222,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -5504,7 +5502,7 @@ again: goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, dest_cpu)) { + if (migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); -- cgit v0.10.2 From 24278d148316d2180be6df40e06db013d8b232b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Sep 2010 17:25:23 -0700 Subject: rcu: priority boosting for TINY_PREEMPT_RCU Add priority boosting, but only for TINY_PREEMPT_RCU. This is enabled by the default-off RCU_BOOST kernel parameter. The priority to which to boost preempted RCU readers is controlled by the RCU_BOOST_PRIO kernel parameter (defaulting to real-time priority 1) and the time to wait before boosting the readers blocking a given grace period is controlled by the RCU_BOOST_DELAY kernel parameter (defaulting to 500 milliseconds). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2fea6c8..69f91aa 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -81,6 +81,12 @@ extern struct group_info init_groups; */ # define CAP_INIT_BSET CAP_FULL_SET +#ifdef CONFIG_RCU_BOOST +#define INIT_TASK_RCU_BOOST() \ + .rcu_boost_mutex = NULL, +#else +#define INIT_TASK_RCU_BOOST() +#endif #ifdef CONFIG_TREE_PREEMPT_RCU #define INIT_TASK_RCU_TREE_PREEMPT() \ .rcu_blocked_node = NULL, @@ -92,7 +98,8 @@ extern struct group_info init_groups; .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ - INIT_TASK_RCU_TREE_PREEMPT() + INIT_TASK_RCU_TREE_PREEMPT() \ + INIT_TASK_RCU_BOOST() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index e18473f..ed1a9bc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1210,6 +1210,9 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rcu_boost_mutex; +#endif /* #ifdef CONFIG_RCU_BOOST */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -1745,7 +1748,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ +#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */ +#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. 
*/ static inline void rcu_copy_process(struct task_struct *p) { @@ -1753,7 +1757,10 @@ static inline void rcu_copy_process(struct task_struct *p) p->rcu_read_unlock_special = 0; #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; -#endif +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + p->rcu_boost_mutex = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); } diff --git a/init/Kconfig b/init/Kconfig index a619a1a..48efefc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -450,6 +450,45 @@ config TREE_RCU_TRACE TREE_PREEMPT_RCU implementations, permitting Makefile to trivially select kernel/rcutree_trace.c. +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && TINY_PREEMPT_RCU + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_PRIO + int "Real-time priority to boost RCU readers to" + range 1 99 + depends on RCU_BOOST + default 1 + help + This option specifies the real-time priority to which preempted + RCU readers are to be boosted. If you are working with CPU-bound + real-time applications, you should specify a priority higher then + the highest-priority CPU-bound application. + + Specify the real-time priority, or take the default if unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + endmenu # "RCU Subsystem" config IKCONFIG diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 86eef29..93d1665 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -36,38 +36,16 @@ #include #include -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* Controls for rcu_cbs() kthread, replacing RCU_SOFTIRQ used previously. */ -static struct task_struct *rcu_cbs_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_cbs_wq); -static unsigned long have_rcu_cbs; -static void invoke_rcu_cbs(void); +/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ +static struct task_struct *rcu_kthread_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static unsigned long have_rcu_kthread_work; +static void invoke_rcu_kthread(void); /* Forward declarations for rcutiny_plugin.h. 
*/ +struct rcu_ctrlblk; static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static int rcu_cbs(void *arg); +static int rcu_kthread(void *arg); static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); @@ -130,7 +108,7 @@ void rcu_sched_qs(int cpu) { if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_cbs(); + invoke_rcu_kthread(); } /* @@ -139,7 +117,7 @@ void rcu_sched_qs(int cpu) void rcu_bh_qs(int cpu) { if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_cbs(); + invoke_rcu_kthread(); } /* @@ -201,37 +179,41 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) * This is a kthread, but it is never stopped, at least not until * the system goes down. */ -static int rcu_cbs(void *arg) +static int rcu_kthread(void *arg) { unsigned long work; + unsigned long morework; unsigned long flags; for (;;) { - wait_event(rcu_cbs_wq, have_rcu_cbs != 0); + wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); + morework = rcu_boost(); local_irq_save(flags); - work = have_rcu_cbs; - have_rcu_cbs = 0; + work = have_rcu_kthread_work; + have_rcu_kthread_work = morework; local_irq_restore(flags); if (work) { rcu_process_callbacks(&rcu_sched_ctrlblk); rcu_process_callbacks(&rcu_bh_ctrlblk); rcu_preempt_process_callbacks(); } + schedule_timeout_interruptible(1); /* Leave CPU for others. */ } return 0; /* Not reached, but needed to shut gcc up. */ } /* - * Wake up rcu_cbs() to process callbacks now eligible for invocation. + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. */ -static void invoke_rcu_cbs(void) +static void invoke_rcu_kthread(void) { unsigned long flags; local_irq_save(flags); - have_rcu_cbs = 1; - wake_up(&rcu_cbs_wq); + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); local_irq_restore(flags); } @@ -327,7 +309,11 @@ EXPORT_SYMBOL_GPL(rcu_barrier_sched); */ static int __init rcu_spawn_kthreads(void) { - rcu_cbs_task = kthread_run(rcu_cbs, NULL, "rcu_cbs"); + struct sched_param sp; + + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); return 0; } early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 95f9239..24f4316 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -24,6 +24,29 @@ #include +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #ifdef CONFIG_TINY_PREEMPT_RCU #include @@ -48,17 +71,27 @@ struct rcu_preempt_ctrlblk { struct list_head *gp_tasks; /* Pointer to the first task blocking the */ /* current grace period, or NULL if there */ - /* is not such task. */ + /* is no such task. 
*/ struct list_head *exp_tasks; /* Pointer to first task blocking the */ /* current expedited grace period, or NULL */ /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there cannot be any such task. */ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority-boosted, or NULL if no priority */ + /* boosting is needed. If there is no */ + /* current or expedited grace period, there */ + /* can be no such task. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ u8 gpnum; /* Current grace period. */ u8 gpcpu; /* Last grace period blocked by the CPU. */ u8 completed; /* Last grace period completed. */ /* If all three are equal, RCU is idle. */ + s8 boosted_this_gp; /* Has boosting already happened? */ + unsigned long boost_time; /* When to start boosting (jiffies) */ }; static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { @@ -124,6 +157,130 @@ static int rcu_preempt_gp_in_progress(void) } /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + return np; +} + +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +/* + * Carry out RCU priority boosting on the task indicated by ->boost_tasks, + * and advance ->boost_tasks to the next task in the ->blkd_tasks list. + */ +static int rcu_boost(void) +{ + unsigned long flags; + struct rt_mutex mtx; + struct list_head *np; + struct task_struct *t; + + if (rcu_preempt_ctrlblk.boost_tasks == NULL) + return 0; /* Nothing to boost. */ + raw_local_irq_save(flags); + rcu_preempt_ctrlblk.boosted_this_gp++; + t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, + rcu_node_entry); + np = rcu_next_node_entry(t); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_local_irq_restore(flags); + rt_mutex_lock(&mtx); + rt_mutex_unlock(&mtx); + return rcu_preempt_ctrlblk.boost_tasks != NULL; +} + +/* + * Check to see if it is now time to start boosting RCU readers blocking + * the current grace period, and, if so, tell the rcu_kthread_task to + * start boosting them. If there is an expedited boost in progress, + * we wait for it to complete. + */ +static void rcu_initiate_boost(void) +{ + if (rcu_preempt_ctrlblk.gp_tasks != NULL && + rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.boosted_this_gp == 0 && + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { + rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; + invoke_rcu_kthread(); + } +} + +/* + * Initiate boosting for an expedited grace period. + */ +static void rcu_initiate_expedited_boost(void) +{ + unsigned long flags; + + raw_local_irq_save(flags); + if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { + rcu_preempt_ctrlblk.boost_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + rcu_preempt_ctrlblk.boosted_this_gp = -1; + invoke_rcu_kthread(); + } + raw_local_irq_restore(flags); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); + +/* + * Do priority-boost accounting for the start of a new grace period. 
+ */ +static void rcu_preempt_boost_start_gp(void) +{ + rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; + if (rcu_preempt_ctrlblk.boosted_this_gp > 0) + rcu_preempt_ctrlblk.boosted_this_gp = 0; +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * If there is no RCU priority boosting, we don't boost. + */ +static int rcu_boost(void) +{ + return 0; +} + +/* + * If there is no RCU priority boosting, we don't initiate boosting. + */ +static void rcu_initiate_boost(void) +{ +} + +/* + * If there is no RCU priority boosting, we don't initiate expedited boosting. + */ +static void rcu_initiate_expedited_boost(void) +{ +} + +/* + * If there is no RCU priority boosting, nothing to do at grace-period start. + */ +static void rcu_preempt_boost_start_gp(void) +{ +} + +#endif /* else #ifdef CONFIG_RCU_BOOST */ + +/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * in a quiescent state. There might be any number of tasks blocked @@ -150,12 +307,14 @@ static void rcu_preempt_cpu_qs(void) rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - /* - * If there is no GP, or if blocked readers are still blocking GP, - * then there is nothing more to do. - */ + /* If there is no GP then there is nothing more to do. */ if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) return; + /* If there are blocked readers, go check up on boosting. */ + if (rcu_preempt_blocked_readers_cgp()) { + rcu_initiate_boost(); + return; + } /* Advance callbacks. */ rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; @@ -168,7 +327,7 @@ static void rcu_preempt_cpu_qs(void) /* If there are done callbacks, cause them to be invoked. */ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_cbs(); + invoke_rcu_kthread(); } /* @@ -186,6 +345,9 @@ static void rcu_preempt_start_gp(void) rcu_preempt_ctrlblk.gp_tasks = rcu_preempt_ctrlblk.blkd_tasks.next; + /* Set up for RCU priority boosting. */ + rcu_preempt_boost_start_gp(); + /* If there is no running reader, CPU is done with GP. */ if (!rcu_preempt_running_reader()) rcu_preempt_cpu_qs(); @@ -306,14 +468,16 @@ static void rcu_read_unlock_special(struct task_struct *t) */ empty = !rcu_preempt_blocked_readers_cgp(); empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; - np = t->rcu_node_entry.next; - if (np == &rcu_preempt_ctrlblk.blkd_tasks) - np = NULL; + np = rcu_next_node_entry(t); list_del(&t->rcu_node_entry); if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) rcu_preempt_ctrlblk.gp_tasks = np; if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) rcu_preempt_ctrlblk.exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) + rcu_preempt_ctrlblk.boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&t->rcu_node_entry); /* @@ -333,6 +497,14 @@ static void rcu_read_unlock_special(struct task_struct *t) if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) rcu_report_exp_done(); } +#ifdef CONFIG_RCU_BOOST + /* Unboost self if was boosted. 
*/ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ local_irq_restore(flags); } @@ -376,7 +548,7 @@ static void rcu_preempt_check_callbacks(void) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_cbs(); + invoke_rcu_kthread(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && rcu_preempt_running_reader()) @@ -534,6 +706,7 @@ void synchronize_rcu_expedited(void) /* Wait for tail of ->blkd_tasks list to drain. */ if (rcu_preempted_readers_exp()) + rcu_initiate_expedited_boost(); wait_event(sync_rcu_preempt_exp_wq, !rcu_preempted_readers_exp()); @@ -575,6 +748,15 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ /* + * Because preemptible RCU does not exist, it is never necessary to + * boost preempted RCU readers. + */ +static int rcu_boost(void) +{ + return 0; +} + +/* * Because preemptible RCU does not exist, it never has any callbacks * to check. */ @@ -614,3 +796,9 @@ void __init rcu_scheduler_starting(void) } #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else /* #ifdef CONFIG_RCU_BOOST */ +#define RCU_BOOST_PRIO 1 +#endif /* #else #ifdef CONFIG_RCU_BOOST */ -- cgit v0.10.2 From 9e571a82f0cb205a65a0ea41657f19f22b7fabb8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Sep 2010 21:26:52 -0700 Subject: rcu: add tracing for TINY_RCU and TINY_PREEMPT_RCU Add tracing for the tiny RCU implementations, including statistics on boosting in the case of TINY_PREEMPT_RCU and RCU_BOOST. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney diff --git a/init/Kconfig b/init/Kconfig index 48efefc..929adf6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -384,7 +384,6 @@ config PREEMPT_RCU config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 93d1665..0344937 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -144,6 +144,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) { struct rcu_head *next, *list; unsigned long flags; + RCU_TRACE(int cb_count = 0); /* If no RCU callbacks ready to invoke, just return. */ if (&rcp->rcucblist == rcp->donetail) @@ -169,7 +170,9 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) list->func(list); local_bh_enable(); list = next; + RCU_TRACE(cb_count++); } + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); } /* @@ -252,6 +255,7 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; + RCU_TRACE(rcp->qlen++); local_irq_restore(flags); } diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 24f4316..f4e0df0 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -23,12 +23,21 @@ */ #include +#include +#include + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ struct rcu_head **donetail; /* ->next pointer of last "done" CB. 
 */
         struct rcu_head **curtail; /* ->next pointer of last CB. */
+        RCU_TRACE(long qlen);      /* Number of pending CBs. */
 };

 /* Definition for rcupdate control block. */
@@ -90,8 +99,26 @@ struct rcu_preempt_ctrlblk {
         u8 gpcpu;     /* Last grace period blocked by the CPU. */
         u8 completed; /* Last grace period completed. */
                       /* If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
         s8 boosted_this_gp;       /* Has boosting already happened? */
         unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+        unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+        unsigned long n_tasks_boosted;
+        unsigned long n_exp_boosts;
+        unsigned long n_normal_boosts;
+        unsigned long n_normal_balk_blkd_tasks;
+        unsigned long n_normal_balk_gp_tasks;
+        unsigned long n_normal_balk_boost_tasks;
+        unsigned long n_normal_balk_boosted;
+        unsigned long n_normal_balk_notyet;
+        unsigned long n_normal_balk_nos;
+        unsigned long n_exp_balk_blkd_tasks;
+        unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
 };

 static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -170,6 +197,65 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
         return np;
 }

+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistics for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+        seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+                   rcu_preempt_ctrlblk.rcb.qlen,
+                   rcu_preempt_ctrlblk.n_grace_periods,
+                   rcu_preempt_ctrlblk.gpnum,
+                   rcu_preempt_ctrlblk.gpcpu,
+                   rcu_preempt_ctrlblk.completed,
+                   "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+                   "N."[!rcu_preempt_ctrlblk.gp_tasks],
+                   "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+        seq_printf(m, " ttb=%c btg=",
+                   "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+        switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+        case -1:
+                seq_puts(m, "exp");
+                break;
+        case 0:
+                seq_puts(m, "no");
+                break;
+        case 1:
+                seq_puts(m, "done");
+                break;
+        default:
+                seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+        }
+        seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+                   rcu_preempt_ctrlblk.n_tasks_boosted,
+                   rcu_preempt_ctrlblk.n_exp_boosts,
+                   rcu_preempt_ctrlblk.n_normal_boosts,
+                   (int)(jiffies & 0xffff),
+                   (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+        seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+                   "normal balk",
+                   rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+                   rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+                   rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+                   rcu_preempt_ctrlblk.n_normal_balk_boosted,
+                   rcu_preempt_ctrlblk.n_normal_balk_notyet,
+                   rcu_preempt_ctrlblk.n_normal_balk_nos);
+        seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
+                   rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+                   rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
 #ifdef CONFIG_RCU_BOOST

 #include "rtmutex_common.h"

@@ -197,6 +283,7 @@ static int rcu_boost(void)
         t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
         raw_local_irq_restore(flags);
         rt_mutex_lock(&mtx);
+        RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
         rt_mutex_unlock(&mtx);
         return rcu_preempt_ctrlblk.boost_tasks != NULL;
 }
@@ -206,16 +293,27 @@ static int rcu_boost(void)
  * the current grace period, and, if so, tell the
rcu_kthread_task to * start boosting them. If there is an expedited boost in progress, * we wait for it to complete. + * + * If there are no blocked readers blocking the current grace period, + * return 0 to let the caller know, otherwise return 1. Note that this + * return value is independent of whether or not boosting was done. */ -static void rcu_initiate_boost(void) +static int rcu_initiate_boost(void) { + if (!rcu_preempt_blocked_readers_cgp()) { + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); + return 0; + } if (rcu_preempt_ctrlblk.gp_tasks != NULL && rcu_preempt_ctrlblk.boost_tasks == NULL && rcu_preempt_ctrlblk.boosted_this_gp == 0 && ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_kthread(); - } + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); + } else + RCU_TRACE(rcu_initiate_boost_trace()); + return 1; } /* @@ -231,7 +329,9 @@ static void rcu_initiate_expedited_boost(void) rcu_preempt_ctrlblk.blkd_tasks.next; rcu_preempt_ctrlblk.boosted_this_gp = -1; invoke_rcu_kthread(); - } + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); + } else + RCU_TRACE(rcu_initiate_exp_boost_trace()); raw_local_irq_restore(flags); } @@ -258,10 +358,13 @@ static int rcu_boost(void) } /* - * If there is no RCU priority boosting, we don't initiate boosting. + * If there is no RCU priority boosting, we don't initiate boosting, + * but we do indicate whether there are blocked readers blocking the + * current grace period. */ -static void rcu_initiate_boost(void) +static int rcu_initiate_boost(void) { + return rcu_preempt_blocked_readers_cgp(); } /* @@ -308,13 +411,14 @@ static void rcu_preempt_cpu_qs(void) current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; /* If there is no GP then there is nothing more to do. */ - if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + if (!rcu_preempt_gp_in_progress()) return; - /* If there are blocked readers, go check up on boosting. */ - if (rcu_preempt_blocked_readers_cgp()) { - rcu_initiate_boost(); + /* + * Check up on boosting. If there are no readers blocking the + * current grace period, leave. + */ + if (rcu_initiate_boost()) return; - } /* Advance callbacks. */ rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; @@ -339,6 +443,7 @@ static void rcu_preempt_start_gp(void) /* Official start of GP. */ rcu_preempt_ctrlblk.gpnum++; + RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); /* Any blocked RCU readers block new GP. */ if (rcu_preempt_blocked_readers_any()) @@ -591,6 +696,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) local_irq_save(flags); *rcu_preempt_ctrlblk.nexttail = head; rcu_preempt_ctrlblk.nexttail = &head->next; + RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); rcu_preempt_start_gp(); /* checks to see if GP needed. */ local_irq_restore(flags); } @@ -747,6 +853,18 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +#ifdef CONFIG_RCU_TRACE + +/* + * Because preemptible RCU does not exist, it is not necessary to + * dump out its statistics. + */ +static void show_tiny_preempt_stats(struct seq_file *m) +{ +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + /* * Because preemptible RCU does not exist, it is never necessary to * boost preempted RCU readers. 
@@ -802,3 +920,97 @@ void __init rcu_scheduler_starting(void) #else /* #ifdef CONFIG_RCU_BOOST */ #define RCU_BOOST_PRIO 1 #endif /* #else #ifdef CONFIG_RCU_BOOST */ + +#ifdef CONFIG_RCU_TRACE + +#ifdef CONFIG_RCU_BOOST + +static void rcu_initiate_boost_trace(void) +{ + if (rcu_preempt_ctrlblk.gp_tasks == NULL) + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; + else if (rcu_preempt_ctrlblk.boost_tasks != NULL) + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; + else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) + rcu_preempt_ctrlblk.n_normal_balk_boosted++; + else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) + rcu_preempt_ctrlblk.n_normal_balk_notyet++; + else + rcu_preempt_ctrlblk.n_normal_balk_nos++; +} + +static void rcu_initiate_exp_boost_trace(void) +{ + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; + else + rcu_preempt_ctrlblk.n_exp_balk_nos++; +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) +{ + unsigned long flags; + + raw_local_irq_save(flags); + rcp->qlen -= n; + raw_local_irq_restore(flags); +} + +/* + * Dump statistics for TINY_RCU, such as they are. + */ +static int show_tiny_stats(struct seq_file *m, void *unused) +{ + show_tiny_preempt_stats(m); + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); + return 0; +} + +static int show_tiny_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_tiny_stats, NULL); +} + +static const struct file_operations show_tiny_stats_fops = { + .owner = THIS_MODULE, + .open = show_tiny_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; + +static int __init rcutiny_trace_init(void) +{ + struct dentry *retval; + + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &show_tiny_stats_fops); + if (!retval) + goto free_out; + return 0; +free_out: + debugfs_remove_recursive(rcudir); + return 1; +} + +static void __exit rcutiny_trace_cleanup(void) +{ + debugfs_remove_recursive(rcudir); +} + +module_init(rcutiny_trace_init); +module_exit(rcutiny_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); +MODULE_LICENSE("GPL"); + +#endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v0.10.2 From 8e79e1f9615b83d1e1d26b328d1b776111ca0cf7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Nov 2010 14:31:19 -0700 Subject: rcu: document TINY_RCU and TINY_PREEMPT_RCU tracing. Add the required verbiage to Documentation/RCU/trace.txt. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index a851118..ff6d3f1 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -1,18 +1,22 @@ CONFIG_RCU_TRACE debugfs Files and Formats -The rcutree implementation of RCU provides debugfs trace output that -summarizes counters and state. This information is useful for debugging -RCU itself, and can sometimes also help to debug abuses of RCU. -The following sections describe the debugfs files and formats. +The rcutree and rcutiny implementations of RCU provide debugfs trace +output that summarizes counters and state. This information is useful for +debugging RCU itself, and can sometimes also help to debug abuses of RCU. 
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.

-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats

-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provide five debugfs files under the
 top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).

 The output of "cat rcu/rcudata" looks as follows:

@@ -326,3 +330,115 @@ o "nn" is the number of times that this CPU needed nothing.
         Alert readers will note that the rcu "nn" number for a given CPU
         very closely matches the rcu_bh "np" number for that same CPU.
         This is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provide a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+             ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+             normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+             exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds. The fields are as follows:
+
+o "qlen" is the number of RCU callbacks currently waiting either
+  for an RCU grace period or waiting to be invoked. This is the
+  only field present for rcu_sched and rcu_bh, due to the
+  short-circuiting of grace periods in those two cases.
+
+o "gp" is the number of grace periods that have completed.
+
+o "g197/p197/c197" displays the grace-period state, with the
+  "g" number being the number of grace periods that have started
+  (mod 256), the "p" number being the number of grace periods
+  that the CPU has responded to (also mod 256), and the "c"
+  number being the number of grace periods that have completed
+  (once again mod 256).
+
+  Why have both "gp" and "g"? Because the data flowing into
+  "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o "tasks" is a set of bits. The first bit is "T" if there are
+  currently tasks that have recently blocked within an RCU
+  read-side critical section, the second bit is "N" if any of the
+  aforementioned tasks are blocking the current RCU grace period,
+  and the third bit is "E" if any of the aforementioned tasks are
+  blocking the current expedited grace period. Each bit is "."
+  if the corresponding condition does not hold.
+
+o "ttb" is a single bit. It is "B" if any of the blocked tasks
+  need to be priority boosted and "." otherwise.
+ +o "btg" indicates whether boosting has been carried out during + the current grace period, with "exp" indicating that boosting + is in progress for an expedited grace period, "no" indicating + that boosting has not yet started for a normal grace period, + "begun" indicating that boosting has bebug for a normal grace + period, and "done" indicating that boosting has completed for + a normal grace period. + +o "ntb" is the total number of tasks subjected to RCU priority boosting + periods since boot. + +o "neb" is the number of expedited grace periods that have had + to resort to RCU priority boosting since boot. + +o "nnb" is the number of normal grace periods that have had + to resort to RCU priority boosting since boot. + +o "j" is the low-order 12 bits of the jiffies counter in hexadecimal. + +o "bt" is the low-order 12 bits of the value that the jiffies counter + will have at the next time that boosting is scheduled to begin. + +o In the line beginning with "normal balk", the fields are as follows: + + o "nt" is the number of times that the system balked from + boosting because there were no blocked tasks to boost. + Note that the system will balk from boosting even if the + grace period is overdue when the currently running task + is looping within an RCU read-side critical section. + There is no point in boosting in this case, because + boosting a running task won't make it run any faster. + + o "gt" is the number of times that the system balked + from boosting because, although there were blocked tasks, + none of them were preventing the current grace period + from completing. + + o "bt" is the number of times that the system balked + from boosting because boosting was already in progress. + + o "b" is the number of times that the system balked from + boosting because boosting had already completed for + the grace period in question. + + o "ny" is the number of times that the system balked from + boosting because it was not yet time to start boosting + the grace period in question. + + o "nos" is the number of times that the system balked from + boosting for inexplicable ("not otherwise specified") + reasons. This can actually happen due to races involving + increments of the jiffies counter. + +o In the line beginning with "exp balk", the fields are as follows: + + o "bt" is the number of times that the system balked from + boosting because there were no blocked tasks to boost. + + o "nos" is the number of times that the system balked from + boosting for inexplicable ("not otherwise specified") + reasons. -- cgit v0.10.2 From e940cc804ec212e483f91167b93d1740c2fd3415 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Nov 2010 14:55:26 -0700 Subject: rcu: Distinguish between boosting and boosted RCU priority boosting's tracing did not distinguish between ongoing boosting and completion of boosting. This commit therefore adds this capability. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f4e0df0..015abae 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -229,6 +229,9 @@ static void show_tiny_preempt_stats(struct seq_file *m) seq_puts(m, "no"); break; case 1: + seq_puts(m, "begun"); + break; + case 2: seq_puts(m, "done"); break; default: @@ -284,6 +287,7 @@ static int rcu_boost(void) raw_local_irq_restore(flags); rt_mutex_lock(&mtx); RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); + rcu_preempt_ctrlblk.boosted_this_gp++; rt_mutex_unlock(&mtx); return rcu_preempt_ctrlblk.boost_tasks != NULL; } -- cgit v0.10.2 From deb7a41815a8a32d4f9ea2af7a48ed1175222cec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Sep 2010 21:33:32 -0700 Subject: rcu: get rid of obsolete "classic" names in TREE_RCU tracing The TREE_RCU tracing had obsolete rcuclassic_trace_init() and rcuclassic_trace_cleanup() function names. This commit brings them up to date: rcutree_trace_init() and rcutree_trace_cleanup(), respectively. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d15430b..78ad3c3 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = { static struct dentry *rcudir; -static int __init rcuclassic_trace_init(void) +static int __init rcutree_trace_init(void) { struct dentry *retval; @@ -337,14 +337,14 @@ free_out: return 1; } -static void __exit rcuclassic_trace_cleanup(void) +static void __exit rcutree_trace_cleanup(void) { debugfs_remove_recursive(rcudir); } -module_init(rcuclassic_trace_init); -module_exit(rcuclassic_trace_cleanup); +module_init(rcutree_trace_init); +module_exit(rcutree_trace_cleanup); MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -- cgit v0.10.2 From 7b27d5475f86186914e54e4a6bb994e9a985337b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 21 Oct 2010 11:29:05 +0800 Subject: rcu,cleanup: move synchronize_sched_expedited() out of sched.c The first version of synchronize_sched_expedited() used the migration code in the scheduler, and was therefore implemented in kernel/sched.c. However, the more recent version of this code no longer uses the migration code, so this commit moves it to the main RCU source files. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 7142ee3..49e8e16 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -66,7 +66,6 @@ extern void call_rcu_sched(struct rcu_head *head, extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); -extern void synchronize_sched_expedited(void); extern int sched_expedited_torture_stats(char *page); static inline void __rcu_read_lock_bh(void) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ea025a6..30ebd7c 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -60,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void) synchronize_sched(); } +static inline void synchronize_sched_expedited(void) +{ + synchronize_sched(); +} + #ifdef CONFIG_TINY_RCU static inline void rcu_preempt_note_context_switch(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c0e9683..3a93348 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -48,6 +48,7 @@ static inline void exit_rcu(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ extern void synchronize_rcu_bh(void); +extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 71a4147..21df7f3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include +#include /* * Check the RCU kernel configuration parameters and print informative @@ -1014,6 +1015,76 @@ static void __init __rcu_init_preempt(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifndef CONFIG_SMP + +void synchronize_sched_expedited(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + */ +void synchronize_sched_expedited(void) +{ + int snap, trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. 
*/ + snap = atomic_read(&synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + atomic_inc(&synchronize_sched_expedited_count); + smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ + #if !defined(CONFIG_RCU_FAST_NO_HZ) /* diff --git a/kernel/sched.c b/kernel/sched.c index ae8f75a..d1e8889 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9131,72 +9131,3 @@ struct cgroup_subsys cpuacct_subsys = { }; #endif /* CONFIG_CGROUP_CPUACCT */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ - barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ - return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. - */ -void synchronize_sched_expedited(void) -{ - int snap, trycount = 0; - - smp_mb(); /* ensure prior mod happens before capturing snap. */ - snap = atomic_read(&synchronize_sched_expedited_count) + 1; - get_online_cpus(); - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - if (trycount++ < 10) - udelay(trycount * num_online_cpus()); - else { - synchronize_sched(); - return; - } - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { - smp_mb(); /* ensure test happens before caller kfree */ - return; - } - get_online_cpus(); - } - atomic_inc(&synchronize_sched_expedited_count); - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#endif /* #else #ifndef CONFIG_SMP */ -- cgit v0.10.2 From 29494be71afe2a16ad04e344306a620d7cc22d06 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Oct 2010 14:13:06 +0800 Subject: rcu,cleanup: simplify the code when cpu is dying When we handle the CPU_DYING notifier, the whole system is stopped except for the current CPU. We therefore need no synchronization with the other CPUs. 
This allows us to move any orphaned RCU callbacks directly to the list
of any online CPU without needing to run them through the global orphan
lists. These global orphan lists can therefore be dispensed with. This
commit makes these changes, though it currently victimizes CPU 0 @@@.

Signed-off-by: Lai Jiangshan
Signed-off-by: Paul E. McKenney

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c..669d7fe 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
         .gpnum = -300, \
         .completed = -300, \
         .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
-        .orphan_cbs_list = NULL, \
-        .orphan_cbs_tail = &structname.orphan_cbs_list, \
-        .orphan_qlen = 0, \
         .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
         .n_force_qs = 0, \
         .n_force_qs_ngp = 0, \
@@ -984,53 +981,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 #ifdef CONFIG_HOTPLUG_CPU

 /*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU.  The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first.  Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
  */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
 {
         int i;
+        /* current DYING CPU is cleared in the cpu_online_mask */
+        int receive_cpu = cpumask_any(cpu_online_mask);
         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+        struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);

         if (rdp->nxtlist == NULL)
                 return;  /* irqs disabled, so comparison is stable. */
-        raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
-        *rsp->orphan_cbs_tail = rdp->nxtlist;
-        rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+        *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+        receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+        receive_rdp->qlen += rdp->qlen;
+        receive_rdp->n_cbs_adopted += rdp->qlen;
+        rdp->n_cbs_orphaned += rdp->qlen;
+        rdp->nxtlist = NULL;
         for (i = 0; i < RCU_NEXT_SIZE; i++)
                 rdp->nxttail[i] = &rdp->nxtlist;
-        rsp->orphan_qlen += rdp->qlen;
-        rdp->n_cbs_orphaned += rdp->qlen;
         rdp->qlen = 0;
-        raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_data *rdp; - - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = this_cpu_ptr(rsp->rda); - if (rsp->orphan_cbs_list == NULL) { - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - return; - } - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; - rdp->qlen += rsp->orphan_qlen; - rdp->n_cbs_adopted += rsp->orphan_qlen; - rsp->orphan_cbs_list = NULL; - rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; - rsp->orphan_qlen = 0; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -1081,8 +1056,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); - - rcu_adopt_orphan_cbs(rsp); } /* @@ -1100,11 +1073,7 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) -{ -} - -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_send_cbs_to_online(struct rcu_state *rsp) { } @@ -1702,10 +1671,7 @@ static void _rcu_barrier(struct rcu_state *rsp, * early. */ atomic_set(&rcu_barrier_cpu_count, 1); - preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ - rcu_adopt_orphan_cbs(rsp); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); - preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); @@ -1831,18 +1797,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_DYING: case CPU_DYING_FROZEN: /* - * preempt_disable() in _rcu_barrier() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(). - * The dying CPU clears its cpu_online_mask bit and - * moves all of its RCU callbacks to ->orphan_cbs_list - * in the context of stop_machine(), so subsequent calls - * to _rcu_barrier() will adopt these callbacks and only - * then queue rcu_barrier_func() on all remaining CPUs. + * The whole machine is "stopped" except this cpu, so we can + * touch any data without introducing corruption. And we send + * the callbacks to an attribute chosen online cpu. */ - rcu_send_cbs_to_orphanage(&rcu_bh_state); - rcu_send_cbs_to_orphanage(&rcu_sched_state); - rcu_preempt_send_cbs_to_orphanage(); + rcu_send_cbs_to_online(&rcu_bh_state); + rcu_send_cbs_to_online(&rcu_sched_state); + rcu_preempt_send_cbs_to_online(); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 91d4170..1a54be2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -203,8 +203,8 @@ struct rcu_data { long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -309,15 +309,7 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. 
*/ raw_spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. Also */ - /* protects the following */ - /* orphan_cbs fields. */ - struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ - /* orphaned by all CPUs in */ - /* a given leaf rcu_node */ - /* going offline. */ - struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ - long orphan_qlen; /* Number of orphaned cbs. */ + /* starting new GP. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -390,7 +382,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_send_cbs_to_orphanage(void); +static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 21df7f3..0de359b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -774,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable RCU's callbacks to ->orphan_cbs_list. + * Move preemptable DYING RCU's callbacks to other online CPU. */ -static void rcu_preempt_send_cbs_to_orphanage(void) +static void rcu_preempt_send_cbs_to_online(void) { - rcu_send_cbs_to_orphanage(&rcu_preempt_state); + rcu_send_cbs_to_online(&rcu_preempt_state); } /* @@ -1002,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) /* * Because there is no preemptable RCU, there are no callbacks to move. */ -static void rcu_preempt_send_cbs_to_orphanage(void) +static void rcu_preempt_send_cbs_to_online(void) { } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 78ad3c3..c8e9785 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->orphan_qlen); + rsp->n_force_qs_lh); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); -- cgit v0.10.2 From 2d999e03b7c8305b4385dd20992e4ed3e827177b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Oct 2010 12:06:18 -0700 Subject: rcu: update documentation/comments for Lai's adoption patch Lai's RCU-callback immediate-adoption patch changes the RCU tracing output, so update tracing.txt. Also update a few comments to clarify the synchronization design. Signed-off-by: Paul E. McKenney diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index ff6d3f1..6a8c73f 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -134,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for been registered in absence of CPU-hotplug activity. o "co" is the number of RCU callbacks that have been orphaned due to - this CPU going offline. + this CPU going offline. These orphaned callbacks have been moved + to an arbitrarily chosen online CPU. 
o "ca" is the number of RCU callbacks that have been adopted due to other CPUs going offline. Note that ci+co-ca+ql is the number of @@ -172,12 +173,12 @@ o "gpnum" is the number of grace periods that have started. It is The output of "cat rcu/rcuhier" looks as follows, with very long lines: -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0 +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 1/1 .>. 0:127 ^0 3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 rcu_bh: -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0 +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 0/1 .>. 0:127 ^0 0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 @@ -216,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that exited immediately (without even being counted in nfqs above) due to contention on ->fqslock. -o "oqlen" is the number of callbacks on the "orphan" callback - list. RCU callbacks are placed on this list by CPUs going - offline, and are "adopted" either by the CPU helping the outgoing - CPU or by the next rcu_barrier*() call, whichever comes first. - o Each element of the form "1/1 0:127 ^0" represents one struct rcu_node. Each line represents one level of the hierarchy, from root to leaves. It is best to think of the rcu_data structures diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 669d7fe..120820ff 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1668,7 +1668,9 @@ static void _rcu_barrier(struct rcu_state *rsp, * decrement rcu_barrier_cpu_count -- otherwise the first CPU * might complete its grace period before all of the other CPUs * did their increment, causing this function to return too - * early. + * early. Note that on_each_cpu() disables irqs, which prevents + * any CPUs from coming online or going offline until each online + * CPU has queued its RCU-barrier callback. */ atomic_set(&rcu_barrier_cpu_count, 1); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); @@ -1797,9 +1799,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_DYING: case CPU_DYING_FROZEN: /* - * The whole machine is "stopped" except this cpu, so we can - * touch any data without introducing corruption. And we send - * the callbacks to an attribute chosen online cpu. + * The whole machine is "stopped" except this CPU, so we can + * touch any data without introducing corruption. We send the + * dying CPU's callbacks to an arbitrarily chosen online CPU. 
 */ rcu_send_cbs_to_online(&rcu_bh_state); rcu_send_cbs_to_online(&rcu_sched_state); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0de359b..643c8f6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -774,7 +774,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable DYING RCU's callbacks to other online CPU. + * Move preemptable RCU's callbacks from dying CPU to other online CPU. */ static void rcu_preempt_send_cbs_to_online(void) { -- cgit v0.10.2 From db3a8920995484e5e9a0abaf3bad2c7311b163db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Oct 2010 07:39:22 -0700 Subject: rcu: fix race condition in synchronize_sched_expedited() The new (early 2010) implementation of synchronize_sched_expedited() uses try_stop_cpus() to force a context switch on every CPU. It also permits concurrent calls to synchronize_sched_expedited() to share a single call to try_stop_cpus() through use of an atomically incremented synchronize_sched_expedited_count variable. Unfortunately, this is subject to failure as follows: o Task A invokes synchronize_sched_expedited(), try_stop_cpus() succeeds, but Task A is preempted before getting to the atomic increment of synchronize_sched_expedited_count. o Task B also invokes synchronize_sched_expedited(), with exactly the same outcome as Task A. o Task C also invokes synchronize_sched_expedited(), again with exactly the same outcome as Tasks A and B. o Task D also invokes synchronize_sched_expedited(), but only gets as far as acquiring the mutex within try_stop_cpus() before being preempted, interrupted, or otherwise delayed. o Task E also invokes synchronize_sched_expedited(), but only gets to the snapshotting of synchronize_sched_expedited_count. o Tasks A, B, and C all increment synchronize_sched_expedited_count. o Task E fails to get the mutex, so checks the new value of synchronize_sched_expedited_count. It finds that the value has increased, so (wrongly) assumes that its work has been done, returning despite there having been no expedited grace period since it began. The solution is to have the lowest-numbered CPU atomically increment the synchronize_sched_expedited_count variable within the synchronize_sched_expedited_cpu_stop() function, which is under the protection of the mutex acquired by try_stop_cpus(). However, this also requires that piggybacking tasks wait for three rather than two instances of try_stop_cpus(), because we cannot control the order in which the per-CPU callback functions occur. Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 643c8f6..c22c4ef 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1041,6 +1041,8 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * robustness against future implementation changes. */ smp_mb(); /* See above comment block. */ + if (cpumask_first(cpu_online_mask) == smp_processor_id()) + atomic_inc(&synchronize_sched_expedited_count); return 0; } @@ -1053,13 +1055,26 @@ * Note that it is illegal to call this function while holding any * lock that is acquired by a CPU-hotplug notifier. Failing to * observe this restriction will result in deadlock.
+ * + * The synchronize_sched_expedited_cpu_stop() function is called + * in stop-CPU context, but in order to keep overhead down to a dull + * roar, we don't force this function to wait for its counterparts + * on other CPUs. One instance of this function will increment the + * synchronize_sched_expedited_count variable per call to + * try_stop_cpus(), but there is no guarantee what order this instance + * will occur in. The worst case is that it is last on one call + * to try_stop_cpus(), and the first on the next call. This means + * that piggybacking requires that synchronize_sched_expedited_count + * be incremented by 3: this guarantees that the piggybacking + * task has waited through an entire cycle of context switches, + * even in the worst case. */ void synchronize_sched_expedited(void) { int snap, trycount = 0; smp_mb(); /* ensure prior mod happens before capturing snap. */ - snap = atomic_read(&synchronize_sched_expedited_count) + 1; + snap = atomic_read(&synchronize_sched_expedited_count) + 2; get_online_cpus(); while (try_stop_cpus(cpu_online_mask, synchronize_sched_expedited_cpu_stop, @@ -1077,7 +1092,6 @@ void synchronize_sched_expedited(void) } get_online_cpus(); } - atomic_inc(&synchronize_sched_expedited_count); smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ put_online_cpus(); } -- cgit v0.10.2 From 46fdb0937f26124700fc9fc80da4776330cc00d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Oct 2010 02:11:40 -0700 Subject: rcu: Make synchronize_srcu_expedited() fast if running readers The synchronize_srcu_expedited() function is currently quick if there are no active readers, but will delay a full jiffy if there are any. If these readers leave their SRCU read-side critical sections quickly, this is way too long to wait. So this commit first waits ten microseconds, and only then falls back to jiffy-at-a-time waiting. Reported-by: Avi Kivity Reported-by: Marcelo Tosatti Tested-by: Takuya Yoshikawa Signed-off-by: Paul E. McKenney diff --git a/init/Kconfig b/init/Kconfig index 929adf6..3551824 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -488,6 +488,21 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config SRCU_SYNCHRONIZE_DELAY + int "Microseconds to delay before waiting for readers" + range 0 20 + default 10 + help + This option controls how long SRCU delays before entering its + loop waiting on SRCU readers. The purpose of this loop is + to avoid the unconditional context-switch penalty that would + otherwise be incurred if there was an active SRCU reader, + in a manner similar to adaptive locking schemes. This should + be set to be a bit longer than the common-case SRCU read-side + critical-section overhead. + + Accept the default if unsure. + endmenu # "RCU Subsystem" config IKCONFIG diff --git a/kernel/srcu.c b/kernel/srcu.c index c71e075..98d8c1e 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -31,6 +31,7 @@ #include #include #include +#include #include static int init_srcu_struct_fields(struct srcu_struct *sp) @@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) * all srcu_read_lock() calls using the old counters have completed. * Their corresponding critical sections might well be still * executing, but the srcu_read_lock() primitives themselves - * will have finished executing. + * will have finished executing. 
We initially give readers + * an arbitrarily chosen 10 microseconds to get out of their + * SRCU read-side critical sections, then loop waiting 1/HZ + * seconds per iteration. */ + if (srcu_readers_active_idx(sp, idx)) + udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); -- cgit v0.10.2 From 822bc180a7f7a7bc5fcaaea195f41b487cc8cae8 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 29 Nov 2010 16:55:40 -0800 Subject: sched: Fix unregister_fair_sched_group() In the flipping and flopping between calling unregister_fair_sched_group() on a per-cpu versus per-group basis we ended up in a bad state. Remove from the list for the passed cpu as opposed to some arbitrary index. ( This fixes explosions w/ autogroup as well as a group creation/destruction stress test. ) Reported-by: Stephen Rothwell Signed-off-by: Paul Turner Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20101130005740.080828123@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 35a6373..66ef579 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8085,7 +8085,6 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; - int i; /* * Only empty task groups can be destroyed; so we can speculatively @@ -8095,7 +8094,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) return; raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[i]); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } #else /* !CONFG_FAIR_GROUP_SCHED */ -- cgit v0.10.2 From d2115778c7ea0df2201f1ad9aab948c49ffa1078 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Nov 2010 19:58:53 +0000 Subject: GFS2: Change two WQ_RESCUERs into WQ_MEM_RECLAIM The WQ_RESCUER flag should only be used internally to the workqueue implementation. 
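For illustration, a minimal sketch of the intended idiom (the subsystem name and init function below are invented, not part of this patch): users that must make forward progress during memory reclaim pass WQ_MEM_RECLAIM to alloc_workqueue(), and the workqueue code sets up the rescuer thread internally.

	#include <linux/workqueue.h>

	static struct workqueue_struct *example_wq;	/* hypothetical */

	static int __init example_wq_init(void)
	{
		/* WQ_MEM_RECLAIM provides a rescuer thread, so queued work
		 * can still run while the system is reclaiming memory. */
		example_wq = alloc_workqueue("example_wq",
					     WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
		if (!example_wq)
			return -ENOMEM;	/* alloc_workqueue() returns NULL on failure */
		return 0;
	}

(The sketch uses the plain NULL check; the GFS2 calls below keep their existing IS_ERR() checks.)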
Signed-off-by: Steven Whitehouse Acked-by: Tejun Heo diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f92c177..5a56568 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1782,12 +1782,13 @@ int __init gfs2_glock_init(void) } #endif - glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZEABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | - WQ_FREEZEABLE, 0); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZEABLE, + 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); return PTR_ERR(gfs2_delete_workqueue); -- cgit v0.10.2 From cc18152eb7c27653199546bd14e991a451ab8d1b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 5 Nov 2010 16:12:36 -0700 Subject: GFS2: fs/gfs2/glock.c: Convert sprintf_symbol to %pS Signed-off-by: Joe Perches Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5a56568..13155f6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1626,18 +1626,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; - char buffer[KSYM_SYMBOL_LEN]; char flags_buf[32]; - sprint_symbol(buffer, gh->gh_ip); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); - gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", - state2str(gh->gh_state), - hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? gh_owner->comm : "(ended)", buffer); + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? 
gh_owner->comm : "(ended)", + (void *)gh->gh_ip); return 0; } -- cgit v0.10.2 From 9e55cd53728719ac3a3234a6618259ab8e203a10 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 9 Nov 2010 14:09:53 +0000 Subject: GFS2: Remove unreachable calls to vmtruncate Suggested-by: Christoph Hellwig Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1213f7..14e682d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); - - gfs2_assert_warn(GFS2_SB(inode), !error); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 12cbea7..457ecd1 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1104,17 +1104,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_end_trans; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - gfs2_assert_warn(sdp, !error); - } - setattr_copy(inode, attr); mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 30b58f07..ecfd40d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1325,17 +1325,8 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) goto out_trans_end; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - gfs2_assert_warn(GFS2_SB(inode), !error); - } - setattr_copy(inode, attr); mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); -- cgit v0.10.2 From 2ae51ed7b548c1d943d080da617515e801ea5c3e Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 10 Nov 2010 15:14:57 +0000 Subject: GFS2: Clean up duplicated setattr code While preparing the last patch I noticed that the gfs2_setattr_simple code had been duplicated into two other places. This patch updates those to call gfs2_setattr_simple rather than open coding it. 
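In outline, the duplication being removed looks as follows (condensed from the hunks below, with error handling trimmed):

	/* before: open coded in each caller */
	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;
	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(ip, dibh->b_data);
	brelse(dibh);

	/* after: one call to the shared helper */
	error = gfs2_setattr_simple(ip, attr);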
Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 457ecd1..1db6b73 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh; u32 ouid, ogid, nuid, ngid; int error; @@ -1100,16 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_setattr_simple(ip, attr); if (error) goto out_end_trans; - setattr_copy(inode, attr); - mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); gfs2_quota_change(ip, -blocks, ouid, ogid); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index ecfd40d..439b61c 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1296,10 +1296,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_location el; - struct buffer_head *dibh; int error; error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); @@ -1321,17 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_trans_end; - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - -out_trans_end: + error = gfs2_setattr_simple(ip, attr); gfs2_trans_end(sdp); return error; } -- cgit v0.10.2 From 5e69069c1afb655b5f1a154856ccdb4bb7327b81 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 9 Nov 2010 16:35:20 -0800 Subject: GFS2: fs/gfs2/glock.c: Use printf extension %pV Using %pV reduces the number of printk calls and eliminates any possible message interleaving from other printk calls. Signed-off-by: Joe Perches Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 13155f6..a9b53a4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -951,17 +951,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); + if (seq) { struct gfs2_glock_iter *gi = seq->private; vsprintf(gi->string, fmt, args); seq_printf(seq, gi->string); } else { - printk(KERN_ERR " "); - vprintk(fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR " %pV", &vaf); } + va_end(args); } -- cgit v0.10.2 From edc221d00bd5c6da0e5c67701f3782b72796619f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 10 Nov 2010 13:19:06 -0800 Subject: GFS2: fs/gfs2/glock.h: Add __attribute__((format(printf,2,3)) to gfs2_print_dbg Functions that use printf formatting, especially those that use %pV, should have their uses of printf format and arguments checked by the compiler. 
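As a sketch of what the annotation catches (the declaration is the one added below; the mismatched call is hypothetical):

	__attribute__ ((format(printf, 2, 3)))
	void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);

	/* gcc now warns: %d expects an int, but a struct pid pointer is passed */
	gfs2_print_dbg(seq, "held by pid %d\n", gh->gh_owner_pid);

Without the attribute such a mismatch compiles silently and only shows up as garbage output at runtime.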
Signed-off-by: Joe Perches Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index db1c26d..a12d117 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -212,6 +212,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +__attribute__ ((format(printf, 2, 3))) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** -- cgit v0.10.2 From 802ec9b6682349d9d9c92a9e55f44324d2954f41 Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Thu, 18 Nov 2010 11:26:46 -0500 Subject: GFS2: Allow gfs2 to update quota usage values through the quotactl interface With this patch the gfs2_set_dqblk() function will be able to update the quota usage block count (FS_DQ_BCOUNT) in addition to the already supported FS_DQ_BHARD (limit) and FS_DQ_BSOFT (warn) fields of the dquot structure. Signed-off-by: Abhi Das Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f606baf..a689901 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } + if (fdq->d_fieldmask & FS_DQ_BCOUNT) { + qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = qp->qu_value; + } } /* Write the quota into the quota file on disk */ @@ -1509,7 +1513,7 @@ out: } /* GFS2 only supports a subset of the XFS fields */ -#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, struct fs_disk_quota *fdq) @@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if ((fdq->d_fieldmask & FS_DQ_BSOFT) && ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & FS_DQ_BHARD) && ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; + + if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && + ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= FS_DQ_BCOUNT; + if (fdq->d_fieldmask == 0) goto out_i; @@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; - -- cgit v0.10.2 From 921169ca2f7c8a0a2ccda2ce33c465dfe3ae30ef Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 29 Nov 2010 12:50:38 +0000 Subject: GFS2: Clean up of gdlm_lock function The DLM never returns -EAGAIN in response to dlm_lock(), and even if it did, the test in gdlm_lock() was wrong anyway. Once that test is removed, it is possible to greatly simplify this code by simply using a "normal" error return code (0 for success). We then no longer need the LM_OUT_ASYNC return code which can be removed. 
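The resulting calling convention, condensed from the do_xmote() hunk below: lm_lock() now returns 0 or a negative errno straight from dlm_lock(), and completion is always reported through the asynchronous gdlm_ast() callback.

	if (sdp->sd_lockstruct.ls_ops->lm_lock) {
		/* lock_dlm: 0 on success, result arrives via the async callback */
		ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
		GLOCK_BUG_ON(gl, ret);
	} else {
		/* lock_nolock: no lock manager, complete the state change directly */
		finish_xmote(gl, target);
	}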
Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a9b53a4..2dd1d72 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -541,21 +541,6 @@ out_locked: spin_unlock(&gl->gl_spin); } -static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int req_state, - unsigned int flags) -{ - int ret = LM_OUT_ERROR; - - if (!sdp->sd_lockstruct.ls_ops->lm_lock) - return req_state == LM_ST_UNLOCKED ? 0 : req_state; - - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, - req_state, flags); - return ret; -} - /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state @@ -575,8 +560,8 @@ __acquires(&gl->gl_spin) lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); - BUG_ON(gl->gl_state == target); - BUG_ON(gl->gl_state == gl->gl_target); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); @@ -594,15 +579,17 @@ __acquires(&gl->gl_spin) gl->gl_state == LM_ST_DEFERRED) && !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) lck_flags |= LM_FLAG_TRY_1CB; - ret = gfs2_lm_lock(sdp, gl, target, lck_flags); - if (!(ret & LM_OUT_ASYNC)) { - finish_xmote(gl, ret); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + GLOCK_BUG_ON(gl, ret); + } else { /* lock_nolock */ + finish_xmote(gl, target); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); - } else { - GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC); } + spin_lock(&gl->gl_spin); } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index a12d117..ba6865c 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -91,7 +91,7 @@ enum { #define GL_NOCACHE 0x00000400 /* - * lm_lock() and lm_async_cb return flags + * lm_async_cb return flags * * LM_OUT_ST_MASK * Masks the lower two bits of lock state in the returned value. @@ -99,15 +99,11 @@ enum { * LM_OUT_CANCELED * The lock request was canceled. * - * LM_OUT_ASYNC - * The result of the request will be returned in an LM_CB_ASYNC callback. 
- * - */ #define LM_OUT_ST_MASK 0x00000003 #define LM_OUT_CANCELED 0x00000008 -#define LM_OUT_ASYNC 0x00000080 -#define LM_OUT_ERROR 0x00000100 +#define LM_OUT_ERROR 0x00000004 /* * lm_recovery_done() messages @@ -124,8 +120,8 @@ struct lm_lockops { void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); - unsigned int (*lm_lock) (struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); const match_table_t *lm_tokens; }; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 1c09425..f40ce34 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -146,11 +146,10 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, return lkf; } -static unsigned int gdlm_lock(struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags) +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; - int error; int req; u32 lkf; @@ -162,13 +161,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, * Submit the actual lock request. */ - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); - if (error == -EAGAIN) - return 0; - if (error) - return LM_OUT_ERROR; - return LM_OUT_ASYNC; + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); } static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) -- cgit v0.10.2 From 5091faa449ee0b7d73bc296a93bca9540fc51d0a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 30 Nov 2010 14:18:03 +0100 Subject: sched: Add 'autogroup' scheduling feature: automated per session task groups A recurring complaint from CFS users is that parallel kbuild has a negative impact on desktop interactivity. This patch implements an idea from Linus, to automatically create task groups. Currently, only per session autogroups are implemented, but the patch leaves the way open for enhancement. Implementation: each task's signal struct contains an inherited pointer to a refcounted autogroup struct containing a task group pointer, the default for all tasks pointing to the init_task_group. When a task calls setsid(), a new task group is created, the process is moved into the new task group, and a reference to the previous task group is dropped. Child processes inherit this task group thereafter, and increase its refcount. When the last thread of a process exits, the process's reference is dropped, such that when the last process referencing an autogroup exits, the autogroup is destroyed. At runqueue selection time, IFF a task has no cgroup assignment, its current autogroup is used. Autogroup bandwidth is controllable via setting its nice level through the proc filesystem: cat /proc/<pid>/autogroup Displays the task's group and the group's nice level. echo <nice level> > /proc/<pid>/autogroup Sets the task group's shares to the weight of a nice <level> task. Setting nice level is rate limited for !admin users due to the abuse risk of task group locking. The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via the boot option noautogroup, and can also be turned on/off on the fly via: echo [01] > /proc/sys/kernel/sched_autogroup_enabled ...
which will automatically move tasks to/from the root task group. Signed-off-by: Mike Galbraith Acked-by: Linus Torvalds Acked-by: Peter Zijlstra Cc: Markus Trippelsdorf Cc: Mathieu Desnoyers Cc: Paul Turner Cc: Oleg Nesterov [ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ] Signed-off-by: Ingo Molnar LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net> Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 92e83e5..86820a7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1622,6 +1622,8 @@ and is between 256 and 4096 characters. It is defined in the file noapic [SMP,APIC] Tells the kernel to not make use of any IOAPICs that may be present in the system. + noautogroup Disable scheduler automatic task group creation. + nobats [PPC] Do not use BATs for mapping kernel lowmem on "Classic" PPC cores. diff --git a/fs/proc/base.c b/fs/proc/base.c index f3d02ca..2fa0ce2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + long nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &nice); + if (err) + return -EINVAL; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), diff --git a/include/linux/sched.h b/include/linux/sched.h index a5b92c7..9c2d46d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -509,6 +509,8 @@ struct thread_group_cputimer { spinlock_t lock; }; +struct autogroup; + /* * NOTE! 
"signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -576,6 +578,9 @@ struct signal_struct { struct tty_struct *tty; /* NULL if no tty */ +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. @@ -1927,6 +1932,24 @@ int sched_rt_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_compat_yield; +#ifdef CONFIG_SCHED_AUTOGROUP +extern unsigned int sysctl_sched_autogroup_enabled; + +extern void sched_autogroup_create_attach(struct task_struct *p); +extern void sched_autogroup_detach(struct task_struct *p); +extern void sched_autogroup_fork(struct signal_struct *sig); +extern void sched_autogroup_exit(struct signal_struct *sig); +#ifdef CONFIG_PROC_FS +extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); +#endif +#else +static inline void sched_autogroup_create_attach(struct task_struct *p) { } +static inline void sched_autogroup_detach(struct task_struct *p) { } +static inline void sched_autogroup_fork(struct signal_struct *sig) { } +static inline void sched_autogroup_exit(struct signal_struct *sig) { } +#endif + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); diff --git a/init/Kconfig b/init/Kconfig index 88c1046..f1bba0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -728,6 +728,19 @@ config NET_NS endif # NAMESPACES +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select EVENTFD + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. 
+ config MM_OWNER bool diff --git a/kernel/fork.c b/kernel/fork.c index 3b159c5..b6f2475 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (atomic_dec_and_test(&sig->sigcnt)) { + sched_autogroup_exit(sig); free_signal_struct(sig); + } } void __put_task_struct(struct task_struct *tsk) @@ -904,6 +906,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) posix_cpu_timers_init_group(sig); tty_audit_fork(sig); + sched_autogroup_fork(sig); sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; diff --git a/kernel/sched.c b/kernel/sched.c index 66ef579..b646dad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,6 +79,7 @@ #include "sched_cpupri.h" #include "workqueue_sched.h" +#include "sched_autogroup.h" #define CREATE_TRACE_POINTS #include @@ -271,6 +272,10 @@ struct task_group { struct task_group *parent; struct list_head siblings; struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif }; #define root_task_group init_task_group @@ -603,11 +608,14 @@ static inline int cpu_of(struct rq *rq) */ static inline struct task_group *task_group(struct task_struct *p) { + struct task_group *tg; struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); - return container_of(css, struct task_group, css); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -1869,6 +1877,7 @@ static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_autogroup.c" #include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" @@ -7750,7 +7759,7 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); - + autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 0000000..57a7ac28 --- /dev/null +++ b/kernel/sched_autogroup.c @@ -0,0 +1,229 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include +#include +#include +#include + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +static void autogroup_init(struct task_struct *init_task) +{ + autogroup_default.tg = &init_task_group; + init_task_group.autogroup = &autogroup_default; + kref_init(&autogroup_default.kref); + init_rwsem(&autogroup_default.lock); + init_task->signal->autogroup = &autogroup_default; +} + +static inline void autogroup_free(struct task_group *tg) +{ + kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ + struct autogroup *ag = container_of(kref, struct autogroup, kref); + + sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ + kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ + kref_get(&ag->kref); + return ag; +} + +static inline struct autogroup *autogroup_create(void) +{ + struct autogroup *ag = 
kzalloc(sizeof(*ag), GFP_KERNEL); + struct task_group *tg; + + if (!ag) + goto out_fail; + + tg = sched_create_group(&init_task_group); + + if (IS_ERR(tg)) + goto out_free; + + kref_init(&ag->kref); + init_rwsem(&ag->lock); + ag->id = atomic_inc_return(&autogroup_seq_nr); + ag->tg = tg; + tg->autogroup = ag; + + return ag; + +out_free: + kfree(ag); +out_fail: + if (printk_ratelimit()) { + printk(KERN_WARNING "autogroup_create: %s failure.\n", + ag ? "sched_create_group()" : "kmalloc()"); + } + + return autogroup_kref_get(&autogroup_default); +} + +static inline bool +task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ + if (tg != &root_task_group) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + /* + * We can only assume the task group can't go away on us if + * autogroup_move_group() can see us on ->thread_group list. + */ + if (p->flags & PF_EXITING) + return false; + + return true; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ + struct autogroup *prev; + struct task_struct *t; + unsigned long flags; + + BUG_ON(!lock_task_sighand(p, &flags)); + + prev = p->signal->autogroup; + if (prev == ag) { + unlock_task_sighand(p, &flags); + return; + } + + p->signal->autogroup = autogroup_kref_get(ag); + + t = p; + do { + sched_move_task(t); + } while_each_thread(p, t); + + unlock_task_sighand(p, &flags); + autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ + struct autogroup *ag = autogroup_create(); + + autogroup_move_group(p, ag); + /* drop extra refrence added by autogroup_create() */ + autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock. Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ + autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ + struct task_struct *p = current; + + spin_lock_irq(&p->sighand->siglock); + sig->autogroup = autogroup_kref_get(p->signal->autogroup); + spin_unlock_irq(&p->sighand->siglock); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ + autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ + sysctl_sched_autogroup_enabled = 0; + + return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +/* Called with siglock held. */ +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (*nice < -20 || *nice > 19) + return -EINVAL; + + err = security_task_setnice(current, *nice); + if (err) + return err; + + if (*nice < 0 && !can_nice(current, *nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. 
*/ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_kref_get(p->signal->autogroup); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + if (!err) + ag->nice = *nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_kref_get(p->signal->autogroup); + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 0000000..5358e24 --- /dev/null +++ b/kernel/sched_autogroup.h @@ -0,0 +1,32 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e95b774..1dfae3d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) #ifdef CONFIG_FAIR_GROUP_SCHED -static void print_cfs_group_stats(struct seq_file *m, int cpu, - struct task_group *tg) +static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; if (!se) @@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif -#ifdef CONFIG_CGROUP_SCHED - { - char path[64]; - - rcu_read_lock(); - cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); - rcu_read_unlock(); - SEQ_printf(m, " %s", path); - } -#endif SEQ_printf(m, "\n"); } @@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irqrestore(&tasklist_lock, flags); } -#if defined(CONFIG_CGROUP_SCHED) && \ - (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) -static void task_group_path(struct task_group *tg, char *buf, int buflen) -{ - /* may be NULL if the underlying cgroup isn't fully-created yet */ - if (!tg->css.cgroup) { - buf[0] = '\0'; - return; - } - cgroup_path(tg->css.cgroup, buf, buflen); -} -#endif - void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#if defined(CONFIG_CGROUP_SCHED) && 
defined(CONFIG_FAIR_GROUP_SCHED) - char path[128]; - struct task_group *tg = cfs_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -215,7 +182,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", cfs_rq->load_contribution); SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&tg->load_weight)); + atomic_read(&cfs_rq->tg->load_weight)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); @@ -224,17 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128]; - struct task_group *tg = rt_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); -#endif - #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd..2745dcd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); - if (err > 0) + if (err > 0) { proc_sid_connector(group_leader); + sched_autogroup_create_attach(group_leader); + } return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a00fdef..121e4ff 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -370,6 +370,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- cgit v0.10.2 From 0b1246e6776c79719ff4a3afd9c38fba99b99d5a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 30 Nov 2010 15:33:04 +0000 Subject: GFS2: Remove duplicate #defines from glock.h There are a number of duplicated #defines in glock.h plus one which is unused. This removes the extra definitions. Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index ba6865c..691851c 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -87,7 +87,6 @@ enum { #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 -#define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 /* @@ -126,19 +125,6 @@ struct lm_lockops { const match_table_t *lm_tokens; }; -#define LM_FLAG_TRY 0x00000001 -#define LM_FLAG_TRY_1CB 0x00000002 -#define LM_FLAG_NOEXP 0x00000004 -#define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 - -#define GL_ASYNC 0x00000040 -#define GL_EXACT 0x00000080 -#define GL_SKIP 0x00000100 -#define GL_NOCACHE 0x00000400 - -#define GLR_TRYFAILED 13 - extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { -- cgit v0.10.2 From 0489b3f5eba735413ccedd425651cf41d6b1f7c5 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 30 Nov 2010 09:38:35 -0600 Subject: GFS2: reread rindex when necessary to grow rindex When GFS2 grew the filesystem, it was never rereading the rindex file during the grow. 
This is necessary for large grows when the filesystem is almost full, and GFS2 needs to use some of the space allocated earlier in the grow to complete it. Now, if GFS2 fails to reserve the necessary space and the rindex file is not uptodate, it rereads it. Also, the only difference between gfs2_ri_update() and gfs2_ri_update_special() was that gfs2_ri_update_special() didn't clear out the existing resource groups, since you knew that it was only called when there were no resource groups. Attempting to clear out the resource groups when there are none takes almost no time, and rarely happens, so I simply removed gfs2_ri_update_special(). Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 33c8407..6375b38 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip) } /** - * gfs2_ri_update_special - Pull in a new resource index from the disk - * - * This is a special version that's safe to call from gfs2_inplace_reserve_i. - * In this case we know that we don't have any resource groups in memory yet. - * - * @ip: pointer to the rindex inode - * - * Returns: 0 on successful update, error code otherwise - */ -static int gfs2_ri_update_special(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct inode *inode = &ip->i_inode; - struct file_ra_state ra_state; - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; - int error; - - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { - /* Ignore partials */ - if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - i_size_read(inode)) - break; - error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; - - sdp->sd_rindex_uptodate = 1; - return 0; -} - -/** * gfs2_rindex_hold - Grab a lock on the rindex * @sdp: The GFS2 superblock * @ri_gh: the glock holder @@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, error = gfs2_rindex_hold(sdp, &al->al_ri_gh); else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update_special(ip); + error = gfs2_ri_update(ip); if (error) return error; } +try_again: do { error = get_local_rgrp(ip, &last_unlinked); /* If there is no space, flushing the log may release some */ - if (error) + if (error) { + if (ip == GFS2_I(sdp->sd_rindex) && + !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + goto try_again; + } gfs2_log_flush(sdp, NULL); + } } while (error && tries++ < 3); if (error) { -- cgit v0.10.2 From 086d8334cf73b3bb695b82dd864a7a8b00d96b7e Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 23 Nov 2010 23:52:55 -0600 Subject: GFS2: fix recursive locking during rindex truncates When you truncate the rindex file, you need to avoid calling gfs2_rindex_hold, since you already hold it. However, if you haven't already read in the resource groups, you need to do that. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5476c06..ae7d205 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (metadata) revokes = (height) ? 
sdp->sd_inptrs : sdp->sd_diptrs; - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + else if (!sdp->sd_rgrps) + error = gfs2_ri_update(ip); + if (error) return error; @@ -879,7 +883,8 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6375b38..25dbe5c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -static int gfs2_ri_update(struct gfs2_inode *ip) +int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0e35c04..50c2bb0 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, extern void gfs2_inplace_release(struct gfs2_inode *ip); +extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); -- cgit v0.10.2 From e06dfc492870e1d380f02722cde084b724dc197b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 30 Nov 2010 15:46:02 +0000 Subject: GFS2: Fix uninitialised error value in previous patch Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ae7d205..3c4039d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, int metadata; unsigned int revokes = 0; int x; - int error; + int error = 0; if (!*top) sm->sm_first = 0; -- cgit v0.10.2 From 47a25380e37f44db7202093ca92e4af569c34f55 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 30 Nov 2010 15:49:31 +0000 Subject: GFS2: Merge glock state fields into a bitfield We can only merge the fields into a bitfield if the locking rules for them are the same. In this case gl_spin covers all of the fields (write side) but a couple of them are used with GLF_LOCK as the read side lock, which should be ok since we know that the field in question won't be changing at the time. The gl_req setting has to be done earlier (in glock.c) in order to place it under gl_spin. The gl_reply setting also has to be brought under gl_spin in order to comply with the new rules. This saves 4*sizeof(unsigned int) per glock. Signed-off-by: Steven Whitehouse Cc: Bob Peterson diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 2dd1d72..08a8beb 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -567,6 +567,7 @@ __acquires(&gl->gl_spin) set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); do_error(gl, 0); /* Fail queued try locks */ } + gl->gl_req = target; spin_unlock(&gl->gl_spin); if (glops->go_xmote_th) glops->go_xmote_th(gl); @@ -1353,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl) * @gl: Pointer to the glock * @ret: The return value from the dlm * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. 
*/ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + spin_lock(&gl->gl_spin); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { - spin_lock(&gl->gl_spin); if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); return; } - spin_unlock(&gl->gl_spin); } + + spin_unlock(&gl->gl_spin); set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + smp_wmb(); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 764fbb4..8d3d2b4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,12 +207,14 @@ struct gfs2_glock { spinlock_t gl_spin; - unsigned int gl_state; - unsigned int gl_target; - unsigned int gl_reply; + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + unsigned int gl_hash; - unsigned int gl_req; - unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ struct list_head gl_holders; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index f40ce34..6e493ae 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -153,7 +153,6 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, int req; u32 lkf; - gl->gl_req = req_state; req = make_mode(req_state); lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); -- cgit v0.10.2 From 87de5ac782761a3ebf806e434e8c9cc205a87274 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 20 Sep 2010 17:42:46 -0700 Subject: timers: Introduce timerlist infrastructure. The timerlist infrastructure is a thin layer over the rbtree code that implements a simple list of timers sorted by an expires value, and a getnext function that provides a pointer to the earliest timer. This infrastructure allows drivers and other kernel infrastructure to easily implement timers without duplicating code. 
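As an aside for readers of this series, a minimal consumer of the new API might look like the sketch below (hypothetical: the my_timer wrapper and function names are mine, not from the patch; per the NOTE in lib/timerlist.c the caller provides all serialization):

#include <linux/ktime.h>
#include <linux/timerlist.h>

/* Driver-private timer wrapping a timerlist_node. */
struct my_timer {
	struct timerlist_node node;
	void (*func)(struct my_timer *t);
};

static struct timerlist_head my_timers;	/* all access serialized by the caller */

static void my_timers_setup(void)
{
	timerlist_init_head(&my_timers);	/* empty rbtree, next == NULL */
}

static void my_timer_start(struct my_timer *t, ktime_t expires)
{
	timerlist_init(&t->node);		/* clear the rb_node */
	t->node.expires = expires;
	timerlist_add(&my_timers, &t->node);	/* O(log n) sorted insert */
}

/* Fire every timer that has expired as of 'now', earliest first. */
static void my_timers_run(ktime_t now)
{
	struct timerlist_node *next;

	while ((next = timerlist_getnext(&my_timers)) != NULL &&
	       next->expires.tv64 <= now.tv64) {
		struct my_timer *t = container_of(next, struct my_timer, node);

		timerlist_del(&my_timers, next);
		t->func(t);
	}
}

The getnext lookup is O(1) because the head caches the leftmost node, which is what makes this layer attractive for expiry paths.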
Signed-off-by: John Stultz LKML Reference: <1290136329-18291-2-git-send-email-john.stultz@linaro.org> Reviewed-by: Thomas Gleixner CC: Alessandro Zummo CC: Thomas Gleixner CC: Richard Cochran diff --git a/include/linux/timerlist.h b/include/linux/timerlist.h new file mode 100644 index 0000000..c46b28a --- /dev/null +++ b/include/linux/timerlist.h @@ -0,0 +1,37 @@ +#ifndef _LINUX_TIMERLIST_H +#define _LINUX_TIMERLIST_H + +#include +#include + + +struct timerlist_node { + struct rb_node node; + ktime_t expires; +}; + +struct timerlist_head { + struct rb_root head; + struct timerlist_node *next; +}; + + +extern void timerlist_add(struct timerlist_head *head, + struct timerlist_node *node); +extern void timerlist_del(struct timerlist_head *head, + struct timerlist_node *node); +extern struct timerlist_node *timerlist_getnext(struct timerlist_head *head); +extern struct timerlist_node *timerlist_iterate_next( + struct timerlist_node *node); + +static inline void timerlist_init(struct timerlist_node *node) +{ + RB_CLEAR_NODE(&node->node); +} + +static inline void timerlist_init_head(struct timerlist_head *head) +{ + head->head = RB_ROOT; + head->next = NULL; +} +#endif /* _LINUX_TIMERLIST_H */ diff --git a/lib/Makefile b/lib/Makefile index e6a3763..8b475cf 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o \ + rbtree.o radix-tree.o dump_stack.o timerlist.o\ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o ratelimit.o show_mem.o \ diff --git a/lib/timerlist.c b/lib/timerlist.c new file mode 100644 index 0000000..9101b42 --- /dev/null +++ b/lib/timerlist.c @@ -0,0 +1,118 @@ +/* + * Generic Timer-list + * + * Manages a simple list of timers, ordered by expiration time. + * Uses rbtrees for quick list adds and expiration. + * + * NOTE: All of the following functions need to be serialized + * to avoid races. No locking is done by this libary code. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +/** + * timerlist_add - Adds timer to timerlist. + * + * @head: head of timerlist + * @node: timer node to be added + * + * Adds the timer node to the timerlist, sorted by the + * node's expires value. 
+ */ +void timerlist_add(struct timerlist_head *head, struct timerlist_node *node) +{ + struct rb_node **p = &head->head.rb_node; + struct rb_node *parent = NULL; + struct timerlist_node *ptr; + + /* Make sure we don't add nodes that are already added */ + WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); + + while (*p) { + parent = *p; + ptr = rb_entry(parent, struct timerlist_node, node); + if (node->expires.tv64 < ptr->expires.tv64) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->node, parent, p); + rb_insert_color(&node->node, &head->head); + + if (!head->next || node->expires.tv64 < head->next->expires.tv64) + head->next = node; +} + +/** + * timerlist_del - Removes a timer from the timerlist. + * + * @head: head of timerlist + * @node: timer node to be removed + * + * Removes the timer node from the timerlist. + */ +void timerlist_del(struct timerlist_head *head, struct timerlist_node *node) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); + + /* update next pointer */ + if (head->next == node) { + struct rb_node *rbn = rb_next(&node->node); + + head->next = rbn ? + rb_entry(rbn, struct timerlist_node, node) : NULL; + } + rb_erase(&node->node, &head->head); + RB_CLEAR_NODE(&node->node); +} + + +/** + * timerlist_getnext - Returns the timer with the earlies expiration time + * + * @head: head of timerlist + * + * Returns a pointer to the timer node that has the + * earliest expiration time. + */ +struct timerlist_node *timerlist_getnext(struct timerlist_head *head) +{ + return head->next; +} + + +/** + * timerlist_iterate_next - Returns the timer after the provided timer + * + * @node: Pointer to a timer. + * + * Provides the timer that is after the given node. This is used, when + * necessary, to iterate through the list of timers in a timer list + * without modifying the list. + */ +struct timerlist_node *timerlist_iterate_next(struct timerlist_node *node) +{ + struct rb_node *next; + + if (!node) + return NULL; + next = rb_next(&node->node); + if (!next) + return NULL; + return container_of(next, struct timerlist_node, node); +} -- cgit v0.10.2 From 08ec0c58fb8a05d3191d5cb6f5d6f81adb419798 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 27 Jul 2010 17:00:00 -0700 Subject: x86: Improve TSC calibration using a delayed workqueue Boot to boot the TSC calibration may vary by quite a large amount. While normal variance of 50-100ppm can easily be seen, the quick calibration code only requires 500ppm accuracy, which is the limit of what NTP can correct for. This can cause problems for systems being used as NTP servers, as every time they reboot it can take hours for them to calculate the new drift error caused by the calibration. The classic trade-off here is calibration accuracy vs slow boot times, as during the calibration nothing else can run. This patch uses a delayed workqueue to calibrate the TSC over the period of a second. This allows very accurate calibration (in my tests only varying by 1 kHz or 0.4ppm boot to boot). Additionally this refined calibration step does not block the boot process, and only delays the TSC clocksource registration by a few seconds in early boot. If the refined calibration strays 1% from the early boot calibration value, the system will fall back to the already calculated early boot calibration. Credit to Andi Kleen who suggested using a timer quite a while back, but I dismissed it thinking the timer calibration would be done after the clocksource was registered (which would break things). Forgive me for my short-sightedness.
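To put the quoted figures side by side, here is an illustrative calculation (standalone C; the 2.5 GHz TSC frequency is an assumption for the example, not from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long tsc_khz = 2500000;		/* assumed 2.5 GHz TSC */
	unsigned long quick   = tsc_khz / 2000;		/* 500 ppm -> 1250 kHz */
	unsigned long refined = tsc_khz * 4 / 10000000;	/* 0.4 ppm ->    1 kHz */
	unsigned long guard   = tsc_khz / 100;		/*  1%     -> 25000 kHz */

	printf("quick calibration bound: +/- %lu kHz\n", quick);
	printf("refined spread:          +/- %lu kHz\n", refined);
	printf("1%% fallback guard:       +/- %lu kHz\n", guard);
	return 0;
}

The refined step tightens the bound by roughly three orders of magnitude, while the 1% guard sits far above either figure, so falling back to the early value on a miss is safe.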
This patch has worked very well in my testing, but TSC hardware is quite varied so it would probably be good to get some extended testing, possibly pushing inclusion out to 2.6.39. Signed-off-by: John Stultz LKML-Reference: <1289003985-29060-1-git-send-email-johnstul@us.ibm.com> Reviewed-by: Thomas Gleixner CC: Thomas Gleixner CC: Ingo Molnar CC: Martin Schwidefsky CC: Clark Williams CC: Andi Kleen diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index bb64beb..dc1393e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -888,7 +888,82 @@ __cpuinit int unsynchronized_tsc(void) return 0; } -static void __init init_tsc_clocksource(void) + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomolies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. + */ +static void tsc_refine_calibration_work(struct work_struct *work) +{ + static u64 tsc_start = -1, ref_start; + static int hpet; + u64 tsc_stop, ref_stop, delta; + unsigned long freq; + + /* Don't bother refining TSC on unstable systems */ + if (check_tsc_unstable()) + goto out; + + /* + * Since the work is started early in boot, we may be + * delayed the first time we expire. So set the workqueue + * again once we know timers are working. + */ + if (tsc_start == -1) { + /* + * Only set hpet once, to avoid mixing hardware + * if the hpet becomes enabled later. + */ + hpet = is_hpet_enabled(); + schedule_delayed_work(&tsc_irqwork, HZ); + tsc_start = tsc_read_refs(&ref_start, hpet); + return; + } + + tsc_stop = tsc_read_refs(&ref_stop, hpet); + + /* hpet or pmtimer available ? */ + if (!hpet && !ref_start && !ref_stop) + goto out; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) + goto out; + + delta = tsc_stop - tsc_start; + delta *= 1000000LL; + if (hpet) + freq = calc_hpet_ref(delta, ref_start, ref_stop); + else + freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + + /* Make sure we're within 1% */ + if (abs(tsc_khz - freq) > tsc_khz/100) + goto out; + + tsc_khz = freq; + printk(KERN_INFO "Refined TSC clocksource calibration: " + "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + +out: + clocksource_register_khz(&clocksource_tsc, tsc_khz); +} + + +static int __init init_tsc_clocksource(void) { if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; @@ -897,8 +972,14 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register_khz(&clocksource_tsc, tsc_khz); + schedule_delayed_work(&tsc_irqwork, 0); + return 0; } +/* + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. 
+ */ +device_initcall(init_tsc_clocksource); void __init tsc_init(void) { @@ -952,6 +1033,5 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); - init_tsc_clocksource(); } -- cgit v0.10.2 From a38c5380ef9f088be9f49b6e4c5d80af8b1b5cd4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Nov 2010 17:50:20 +0100 Subject: x86: io_apic: Split setup_ioapic_ids_from_mpc() Sodaville needs to set up the IO_APIC ids as the boot loader leaves them uninitialized. Split out the setter function so it can be called unconditionally from the Sodaville board code. Signed-off-by: Sebastian Andrzej Siewior Cc: Yinghai Lu LKML-Reference: <20101126165020.GA26361@www.tglx.de> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 240a0a5..d7d46cb 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -169,6 +169,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void setup_ioapic_ids_from_mpc(void); +extern void setup_ioapic_ids_from_mpc_nocheck(void); struct mp_ioapic_gsi{ u32 gsi_base; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce3c6fb..4f026a6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1934,8 +1934,7 @@ void disable_IO_APIC(void) * * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 */ - -void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc_nocheck(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -1944,15 +1943,6 @@ void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (acpi_ioapic) - return; - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. @@ -2006,7 +1996,6 @@ void __init setup_ioapic_ids_from_mpc(void) physids_or(phys_id_present_map, phys_id_present_map, tmp); } - /* * We need to adjust the IRQ routing table * if the ID changed. @@ -2042,6 +2031,21 @@ void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} #endif int no_timer_check __initdata; -- cgit v0.10.2 From 5ec6960f6f0c7be9cc6e5506fdf0070add3b6e08 Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Mon, 22 Nov 2010 06:28:48 -0800 Subject: ce4100: Add errata fixes for UART on CE4100 This patch enables the UART on the CE4100. The UART has a couple of issues that need to be worked around. First, the UART is mostly PC compatible except that it is clocked eight times faster than a standard PC, so the default configuration provided in arch/x86/include/asm/serial.h needs to be overridden. Second, the TX interrupt may not be set correctly all the time. Lastly, accessing the UART via I/O space for early_printk() hangs the chip when the IOAPIC is enabled.
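On the clocking point, the arithmetic is worth spelling out (an assumed derivation; only the 14745600 uartclk value below appears in the patch): the classic PC UART reference clock is 1.8432 MHz, and eight times that is exactly what the fixup programs.

#include <stdio.h>

#define PC_UART_CLK 1843200	/* classic PC 1.8432 MHz UART clock */

int main(void)
{
	/* 8x the PC rate gives the uartclk used in ce4100_serial_fixup(). */
	printf("8 * %d = %d\n", PC_UART_CLK, 8 * PC_UART_CLK);	/* 14745600 */
	return 0;
}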
A custom mem_serial_in() is provided to work around the TX interrupt issue. The configuration issues are dealt with in the call back registered with the 8250 driver via serial8250_set_isa_configurator() Signed-off-by: Dirk Brandewie LKML-Reference: <1290436128-17958-1-git-send-email-dirk.brandewie@gmail.com> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 0ede12b..d2c0d51 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -12,20 +12,110 @@ #include #include #include +#include +#include #include +#include static int ce4100_i8042_detect(void) { return 0; } -static void __init sdv_arch_setup(void) +static void __init sdv_find_smp_config(void) { } -static void __init sdv_find_smp_config(void) +#ifdef CONFIG_SERIAL_8250 + + +static unsigned int mem_serial_in(struct uart_port *p, int offset) +{ + offset = offset << p->regshift; + return readl(p->membase + offset); +} + +/* + * The UART Tx interrupts are not set under some conditions and therefore serial + * transmission hangs. This is a silicon issue and has not been root caused. The + * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT + * bit of LSR register in interrupt handler to see whether at least one of these + * two bits is set, if so then process the transmit request. If this workaround + * is not applied, then the serial transmission may hang. This workaround is for + * errata number 9 in Errata - B step. +*/ + +static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset) +{ + unsigned int ret, ier, lsr; + + if (offset == UART_IIR) { + offset = offset << p->regshift; + ret = readl(p->membase + offset); + if (ret & UART_IIR_NO_INT) { + /* see if the TX interrupt should have really set */ + ier = mem_serial_in(p, UART_IER); + /* see if the UART's XMIT interrupt is enabled */ + if (ier & UART_IER_THRI) { + lsr = mem_serial_in(p, UART_LSR); + /* now check to see if the UART should be + generating an interrupt (but isn't) */ + if (lsr & (UART_LSR_THRE | UART_LSR_TEMT)) + ret &= ~UART_IIR_NO_INT; + } + } + } else + ret = mem_serial_in(p, offset); + return ret; +} + +static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) +{ + offset = offset << p->regshift; + writel(value, p->membase + offset); +} + +static void ce4100_serial_fixup(int port, struct uart_port *up, + unsigned short *capabilites) +{ +#ifdef CONFIG_EARLY_PRINTK + /* + * Over ride the legacy port configuration that comes from + * asm/serial.h. 
Using the ioport driver then switching to the + * PCI memmaped driver hangs the IOAPIC + */ + if (up->iotype != UPIO_MEM32) { + up->uartclk = 14745600; + up->mapbase = 0xdffe0200; + set_fixmap_nocache(FIX_EARLYCON_MEM_BASE, + up->mapbase & PAGE_MASK); + up->membase = + (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); + up->membase += up->mapbase & ~PAGE_MASK; + up->iotype = UPIO_MEM32; + up->regshift = 2; + } +#endif + up->iobase = 0; + up->serial_in = ce4100_mem_serial_in; + up->serial_out = ce4100_mem_serial_out; + + *capabilites |= (1 << 12); +} + +static __init void sdv_serial_fixup(void) +{ + serial8250_set_isa_configurator(ce4100_serial_fixup); +} + +#else +static inline void sdv_serial_fixup(void); +#endif + +static void __init sdv_arch_setup(void) { + sdv_serial_fixup(); } /* -- cgit v0.10.2 From e4d2ebcab11b308b5b59073578efd33eccd55d46 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 3 Dec 2010 11:51:38 +0800 Subject: x86, apbt: Setup affinity for apb timers acting as per-cpu timer Commit a5ef2e70 "x86: Sanitize apb timer interrupt handling" forgot the affinity setup when cleaning up the code, this patch just adds the forgotten part Signed-off-by: Feng Tang Cc: Jacob Pan Cc: Alan Cox LKML-Reference: <1291348298-21263-2-git-send-email-feng.tang@intel.com> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 92543c7..7c9ab59 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -315,6 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev) if (system_state == SYSTEM_BOOTING) { irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* APB timer irqs are set up as mp_irqs, timer is edge type */ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); if (request_irq(adev->irq, apbt_interrupt_handler, -- cgit v0.10.2 From 991cfffa7c19aa648546aff666595af896e568ba Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 3 Dec 2010 11:51:37 +0800 Subject: x86, earlyprintk: Move mrst early console to platform/ and fix a typo Move the code to arch/x86/platform/mrst/. 
Also fix a typo to use the correct config option: CONFIG_EARLY_PRINTK_MRST Signed-off-by: Feng Tang Cc: alan@linux.intel.com LKML-Reference: <1291348298-21263-1-git-send-email-feng.tang@intel.com> Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9e13763..f60153d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -84,7 +84,6 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 4572f25..cd28a35 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_X86_MRST_EARLY_PRINTK +#ifdef CONFIG_EARLY_PRINTK_MRST if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); @@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf) hsu_early_console_init(); early_console_register(&early_hsu_console, keep); } - #endif buf++; } diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c deleted file mode 100644 index 65df603..0000000 --- a/arch/x86/kernel/early_printk_mrst.c +++ /dev/null @@ -1,319 +0,0 @@ -/* - * early_printk_mrst.c - early consoles for Intel MID platforms - * - * Copyright (c) 2008-2010, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -/* - * This file implements two early consoles named mrst and hsu.
- * mrst is based on Maxim3110 spi-uart device, it exists in both - * Moorestown and Medfield platforms, while hsu is based on a High - * Speed UART device which only exists in the Medfield platform - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define MRST_SPI_TIMEOUT 0x200000 -#define MRST_REGBASE_SPI0 0xff128000 -#define MRST_REGBASE_SPI1 0xff128400 -#define MRST_CLK_SPI0_REG 0xff11d86c - -/* Bit fields in CTRLR0 */ -#define SPI_DFS_OFFSET 0 - -#define SPI_FRF_OFFSET 4 -#define SPI_FRF_SPI 0x0 -#define SPI_FRF_SSP 0x1 -#define SPI_FRF_MICROWIRE 0x2 -#define SPI_FRF_RESV 0x3 - -#define SPI_MODE_OFFSET 6 -#define SPI_SCPH_OFFSET 6 -#define SPI_SCOL_OFFSET 7 -#define SPI_TMOD_OFFSET 8 -#define SPI_TMOD_TR 0x0 /* xmit & recv */ -#define SPI_TMOD_TO 0x1 /* xmit only */ -#define SPI_TMOD_RO 0x2 /* recv only */ -#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ - -#define SPI_SLVOE_OFFSET 10 -#define SPI_SRL_OFFSET 11 -#define SPI_CFS_OFFSET 12 - -/* Bit fields in SR, 7 bits */ -#define SR_MASK 0x7f /* cover 7 bits */ -#define SR_BUSY (1 << 0) -#define SR_TF_NOT_FULL (1 << 1) -#define SR_TF_EMPT (1 << 2) -#define SR_RF_NOT_EMPT (1 << 3) -#define SR_RF_FULL (1 << 4) -#define SR_TX_ERR (1 << 5) -#define SR_DCOL (1 << 6) - -struct dw_spi_reg { - u32 ctrl0; - u32 ctrl1; - u32 ssienr; - u32 mwcr; - u32 ser; - u32 baudr; - u32 txfltr; - u32 rxfltr; - u32 txflr; - u32 rxflr; - u32 sr; - u32 imr; - u32 isr; - u32 risr; - u32 txoicr; - u32 rxoicr; - u32 rxuicr; - u32 msticr; - u32 icr; - u32 dmacr; - u32 dmatdlr; - u32 dmardlr; - u32 idr; - u32 version; - - /* Currently operates as 32 bits, though only the low 16 bits matter */ - u32 dr; -} __packed; - -#define dw_readl(dw, name) __raw_readl(&(dw)->name) -#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) - -/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ -static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; - -static u32 *pclk_spi0; -/* Always contains an accessable address, start with 0 */ -static struct dw_spi_reg *pspi; - -static struct kmsg_dumper dw_dumper; -static int dumper_registered; - -static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) -{ - int i; - - /* When run to this, we'd better re-init the HW */ - mrst_early_console_init(); - - for (i = 0; i < l1; i++) - early_mrst_console.write(&early_mrst_console, s1 + i, 1); - for (i = 0; i < l2; i++) - early_mrst_console.write(&early_mrst_console, s2 + i, 1); -} - -/* Set the ratio rate to 115200, 8n1, IRQ disabled */ -static void max3110_write_config(void) -{ - u16 config; - - config = 0xc001; - dw_writel(pspi, dr, config); -} - -/* Translate char to a eligible word and send to max3110 */ -static void max3110_write_data(char c) -{ - u16 data; - - data = 0x8000 | c; - dw_writel(pspi, dr, data); -} - -void mrst_early_console_init(void) -{ - u32 ctrlr0 = 0; - u32 spi0_cdiv; - u32 freq; /* Freqency info only need be searched once */ - - /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ - pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - MRST_CLK_SPI0_REG); - spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; - freq = 100000000 / (spi0_cdiv + 1); - - if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) - mrst_spi_paddr = MRST_REGBASE_SPI1; - - pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - mrst_spi_paddr); - - /* Disable SPI 
controller */ - dw_writel(pspi, ssienr, 0); - - /* Set control param, 8 bits, transmit only mode */ - ctrlr0 = dw_readl(pspi, ctrl0); - - ctrlr0 &= 0xfcc0; - ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) - | (SPI_TMOD_TO << SPI_TMOD_OFFSET); - dw_writel(pspi, ctrl0, ctrlr0); - - /* - * Change the spi0 clk to comply with 115200 bps, use 100000 to - * calculate the clk dividor to make the clock a little slower - * than real baud rate. - */ - dw_writel(pspi, baudr, freq/100000); - - /* Disable all INT for early phase */ - dw_writel(pspi, imr, 0x0); - - /* Set the cs to spi-uart */ - dw_writel(pspi, ser, 0x2); - - /* Enable the HW, the last step for HW init */ - dw_writel(pspi, ssienr, 0x1); - - /* Set the default configuration */ - max3110_write_config(); - - /* Register the kmsg dumper */ - if (!dumper_registered) { - dw_dumper.dump = dw_kmsg_dump; - kmsg_dump_register(&dw_dumper); - dumper_registered = 1; - } -} - -/* Slave select should be called in the read/write function */ -static void early_mrst_spi_putc(char c) -{ - unsigned int timeout; - u32 sr; - - timeout = MRST_SPI_TIMEOUT; - /* Early putc needs to make sure the TX FIFO is not full */ - while (--timeout) { - sr = dw_readl(pspi, sr); - if (!(sr & SR_TF_NOT_FULL)) - cpu_relax(); - else - break; - } - - if (!timeout) - pr_warning("MRST earlycon: timed out\n"); - else - max3110_write_data(c); -} - -/* Early SPI only uses polling mode */ -static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_mrst_spi_putc('\r'); - early_mrst_spi_putc(*str); - str++; - } -} - -struct console early_mrst_console = { - .name = "earlymrst", - .write = early_mrst_spi_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* - * Following is the early console based on Medfield HSU (High - * Speed UART) device. 
- */ -#define HSU_PORT2_PADDR 0xffa28180 - -static void __iomem *phsu; - -void hsu_early_console_init(void) -{ - u8 lcr; - - phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - HSU_PORT2_PADDR); - - /* Disable FIFO */ - writeb(0x0, phsu + UART_FCR); - - /* Set to default 115200 bps, 8n1 */ - lcr = readb(phsu + UART_LCR); - writeb((0x80 | lcr), phsu + UART_LCR); - writeb(0x18, phsu + UART_DLL); - writeb(lcr, phsu + UART_LCR); - writel(0x3600, phsu + UART_MUL*4); - - writeb(0x8, phsu + UART_MCR); - writeb(0x7, phsu + UART_FCR); - writeb(0x3, phsu + UART_LCR); - - /* Clear IRQ status */ - readb(phsu + UART_LSR); - readb(phsu + UART_RX); - readb(phsu + UART_IIR); - readb(phsu + UART_MSR); - - /* Enable FIFO */ - writeb(0x7, phsu + UART_FCR); -} - -#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) - -static void early_hsu_putc(char ch) -{ - unsigned int timeout = 10000; /* 10ms */ - u8 status; - - while (--timeout) { - status = readb(phsu + UART_LSR); - if (status & BOTH_EMPTY) - break; - udelay(1); - } - - /* Only write the char when there was no timeout */ - if (timeout) - writeb(ch, phsu + UART_TX); -} - -static void early_hsu_write(struct console *con, const char *str, unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_hsu_putc('\r'); - early_hsu_putc(*str); - str++; - } -} - -struct console early_hsu_console = { - .name = "earlyhsu", - .write = early_hsu_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index 4d3e256..f61ccdd 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_X86_MRST) += mrst.o obj-$(CONFIG_X86_MRST) += vrtc.o +obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c new file mode 100644 index 0000000..65df603 --- /dev/null +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -0,0 +1,319 @@ +/* + * early_printk_mrst.c - early consoles for Intel MID platforms + * + * Copyright (c) 2008-2010, Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +/* + * This file implements two early consoles named mrst and hsu. 
+ * mrst is based on Maxim3110 spi-uart device, it exists in both + * Moorestown and Medfield platforms, while hsu is based on a High + * Speed UART device which only exists in the Medfield platform + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define MRST_SPI_TIMEOUT 0x200000 +#define MRST_REGBASE_SPI0 0xff128000 +#define MRST_REGBASE_SPI1 0xff128400 +#define MRST_CLK_SPI0_REG 0xff11d86c + +/* Bit fields in CTRLR0 */ +#define SPI_DFS_OFFSET 0 + +#define SPI_FRF_OFFSET 4 +#define SPI_FRF_SPI 0x0 +#define SPI_FRF_SSP 0x1 +#define SPI_FRF_MICROWIRE 0x2 +#define SPI_FRF_RESV 0x3 + +#define SPI_MODE_OFFSET 6 +#define SPI_SCPH_OFFSET 6 +#define SPI_SCOL_OFFSET 7 +#define SPI_TMOD_OFFSET 8 +#define SPI_TMOD_TR 0x0 /* xmit & recv */ +#define SPI_TMOD_TO 0x1 /* xmit only */ +#define SPI_TMOD_RO 0x2 /* recv only */ +#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ + +#define SPI_SLVOE_OFFSET 10 +#define SPI_SRL_OFFSET 11 +#define SPI_CFS_OFFSET 12 + +/* Bit fields in SR, 7 bits */ +#define SR_MASK 0x7f /* cover 7 bits */ +#define SR_BUSY (1 << 0) +#define SR_TF_NOT_FULL (1 << 1) +#define SR_TF_EMPT (1 << 2) +#define SR_RF_NOT_EMPT (1 << 3) +#define SR_RF_FULL (1 << 4) +#define SR_TX_ERR (1 << 5) +#define SR_DCOL (1 << 6) + +struct dw_spi_reg { + u32 ctrl0; + u32 ctrl1; + u32 ssienr; + u32 mwcr; + u32 ser; + u32 baudr; + u32 txfltr; + u32 rxfltr; + u32 txflr; + u32 rxflr; + u32 sr; + u32 imr; + u32 isr; + u32 risr; + u32 txoicr; + u32 rxoicr; + u32 rxuicr; + u32 msticr; + u32 icr; + u32 dmacr; + u32 dmatdlr; + u32 dmardlr; + u32 idr; + u32 version; + + /* Currently operates as 32 bits, though only the low 16 bits matter */ + u32 dr; +} __packed; + +#define dw_readl(dw, name) __raw_readl(&(dw)->name) +#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) + +/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ +static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; + +static u32 *pclk_spi0; +/* Always contains an accessable address, start with 0 */ +static struct dw_spi_reg *pspi; + +static struct kmsg_dumper dw_dumper; +static int dumper_registered; + +static void dw_kmsg_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + int i; + + /* When run to this, we'd better re-init the HW */ + mrst_early_console_init(); + + for (i = 0; i < l1; i++) + early_mrst_console.write(&early_mrst_console, s1 + i, 1); + for (i = 0; i < l2; i++) + early_mrst_console.write(&early_mrst_console, s2 + i, 1); +} + +/* Set the ratio rate to 115200, 8n1, IRQ disabled */ +static void max3110_write_config(void) +{ + u16 config; + + config = 0xc001; + dw_writel(pspi, dr, config); +} + +/* Translate char to a eligible word and send to max3110 */ +static void max3110_write_data(char c) +{ + u16 data; + + data = 0x8000 | c; + dw_writel(pspi, dr, data); +} + +void mrst_early_console_init(void) +{ + u32 ctrlr0 = 0; + u32 spi0_cdiv; + u32 freq; /* Freqency info only need be searched once */ + + /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ + pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + MRST_CLK_SPI0_REG); + spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; + freq = 100000000 / (spi0_cdiv + 1); + + if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) + mrst_spi_paddr = MRST_REGBASE_SPI1; + + pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + mrst_spi_paddr); + + /* Disable SPI 
controller */ + dw_writel(pspi, ssienr, 0); + + /* Set control param, 8 bits, transmit only mode */ + ctrlr0 = dw_readl(pspi, ctrl0); + + ctrlr0 &= 0xfcc0; + ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) + | (SPI_TMOD_TO << SPI_TMOD_OFFSET); + dw_writel(pspi, ctrl0, ctrlr0); + + /* + * Change the spi0 clk to comply with 115200 bps, use 100000 to + * calculate the clk dividor to make the clock a little slower + * than real baud rate. + */ + dw_writel(pspi, baudr, freq/100000); + + /* Disable all INT for early phase */ + dw_writel(pspi, imr, 0x0); + + /* Set the cs to spi-uart */ + dw_writel(pspi, ser, 0x2); + + /* Enable the HW, the last step for HW init */ + dw_writel(pspi, ssienr, 0x1); + + /* Set the default configuration */ + max3110_write_config(); + + /* Register the kmsg dumper */ + if (!dumper_registered) { + dw_dumper.dump = dw_kmsg_dump; + kmsg_dump_register(&dw_dumper); + dumper_registered = 1; + } +} + +/* Slave select should be called in the read/write function */ +static void early_mrst_spi_putc(char c) +{ + unsigned int timeout; + u32 sr; + + timeout = MRST_SPI_TIMEOUT; + /* Early putc needs to make sure the TX FIFO is not full */ + while (--timeout) { + sr = dw_readl(pspi, sr); + if (!(sr & SR_TF_NOT_FULL)) + cpu_relax(); + else + break; + } + + if (!timeout) + pr_warning("MRST earlycon: timed out\n"); + else + max3110_write_data(c); +} + +/* Early SPI only uses polling mode */ +static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_mrst_spi_putc('\r'); + early_mrst_spi_putc(*str); + str++; + } +} + +struct console early_mrst_console = { + .name = "earlymrst", + .write = early_mrst_spi_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* + * Following is the early console based on Medfield HSU (High + * Speed UART) device. 
+ */ +#define HSU_PORT2_PADDR 0xffa28180 + +static void __iomem *phsu; + +void hsu_early_console_init(void) +{ + u8 lcr; + + phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + HSU_PORT2_PADDR); + + /* Disable FIFO */ + writeb(0x0, phsu + UART_FCR); + + /* Set to default 115200 bps, 8n1 */ + lcr = readb(phsu + UART_LCR); + writeb((0x80 | lcr), phsu + UART_LCR); + writeb(0x18, phsu + UART_DLL); + writeb(lcr, phsu + UART_LCR); + writel(0x3600, phsu + UART_MUL*4); + + writeb(0x8, phsu + UART_MCR); + writeb(0x7, phsu + UART_FCR); + writeb(0x3, phsu + UART_LCR); + + /* Clear IRQ status */ + readb(phsu + UART_LSR); + readb(phsu + UART_RX); + readb(phsu + UART_IIR); + readb(phsu + UART_MSR); + + /* Enable FIFO */ + writeb(0x7, phsu + UART_FCR); +} + +#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) + +static void early_hsu_putc(char ch) +{ + unsigned int timeout = 10000; /* 10ms */ + u8 status; + + while (--timeout) { + status = readb(phsu + UART_LSR); + if (status & BOTH_EMPTY) + break; + udelay(1); + } + + /* Only write the char when there was no timeout */ + if (timeout) + writeb(ch, phsu + UART_TX); +} + +static void early_hsu_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_hsu_putc('\r'); + early_hsu_putc(*str); + str++; + } +} + +struct console early_hsu_console = { + .name = "earlyhsu", + .write = early_hsu_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; -- cgit v0.10.2 From bcd7278d8a423a255e45f4d10afe564328f1885f Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 7 Dec 2010 13:58:56 -0500 Subject: GFS2: fsck.gfs2 reported statfs error after gfs2_grow When you do gfs2_grow it failed to take the very last rgrp into account when adding up the new free space due to an off-by-one error. It was not reading the last rgrp from the rindex because of a check for "<=" that should have been "<". Therefore, fsck.gfs2 was finding (and fixing) an error with the system statfs file. Signed-off-by: Bob Peterson diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 25dbe5c..7293ea2 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); -- cgit v0.10.2 From 806c09a7db457be3758e14b1f152761135d89af5 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Tue, 30 Nov 2010 19:51:33 +0100 Subject: sched: Make pushable_tasks CONFIG_SMP dependant As noted by Peter Zijlstra at https://lkml.org/lkml/2010/11/10/391 (while reviewing other stuff, though), tracking pushable tasks only makes sense on SMP systems. 
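The trick the diff below relies on — an initializer macro that expands to a field initializer on SMP and to nothing otherwise — keeps the large designated initializer in init_task.h valid in both configurations. A standalone sketch of the pattern (all names hypothetical):

#include <stdio.h>

#define CONFIG_SMP 1	/* flip to 0 to compile the field out */

struct task {
	int prio;
#if CONFIG_SMP
	int pushable_prio;	/* only exists on SMP builds */
#endif
	const char *name;
};

#if CONFIG_SMP
# define INIT_PUSHABLE(p)	.pushable_prio = (p),
#else
# define INIT_PUSHABLE(p)
#endif

/* The macro carries its own trailing comma, so the surrounding
 * initializer list stays well-formed whether or not it expands. */
static struct task init_task = {
	.prio	= 120,
	INIT_PUSHABLE(140)
	.name	= "init",
};

int main(void)
{
	printf("%s: prio=%d\n", init_task.name, init_task.prio);
	return 0;
}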
Signed-off-by: Dario Faggioli Acked-by: Steven Rostedt Acked-by: Gregory Haskins Signed-off-by: Peter Zijlstra LKML-Reference: <1291143093.2697.298.camel@Palantir> Signed-off-by: Ingo Molnar diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f8c06c..6ed8812 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -12,6 +12,13 @@ #include #include +#ifdef CONFIG_SMP +# define INIT_PUSHABLE_TASKS(tsk) \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), +#else +# define INIT_PUSHABLE_TASKS(tsk) +#endif + extern struct files_struct init_files; extern struct fs_struct init_fs; @@ -137,7 +144,7 @@ extern struct cred init_cred; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ - .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ + INIT_PUSHABLE_TASKS(tsk) \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c2d46d..4f92a23 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1240,7 +1240,9 @@ struct task_struct { #endif struct list_head tasks; +#ifdef CONFIG_SMP struct plist_node pushable_tasks; +#endif struct mm_struct *mm, *active_mm; #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched.c b/kernel/sched.c index b646dad..3925a1b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2595,7 +2595,9 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif put_cpu(); } -- cgit v0.10.2 From 40dc11ffb35e8c4e8fa71092048e0f8de9db758c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 26 Nov 2010 17:22:16 +0100 Subject: printk: Use this_cpu_{read|write} api on printk_pending __get_cpu_var() is a bit inefficient, lets use __this_cpu_read() and __this_cpu_write() to manipulate printk_pending. printk_needs_cpu(cpu) is called only for the current cpu : Use faster __this_cpu_read(). Remove the redundant unlikely on (cpu_is_offline(cpu)) test: # size kernel/printk.o* text data bss dec hex filename 9942 756 263488 274186 42f0a kernel/printk.o.new 9990 756 263488 274234 42f3a kernel/printk.o.old Signed-off-by: Eric Dumazet Cc: Heiko Carstens Cc: H. 
Peter Anvin Cc: Christoph Lameter Signed-off-by: Peter Zijlstra LKML-Reference: <1290788536.2855.237.camel@edumazet-laptop> Signed-off-by: Ingo Molnar diff --git a/kernel/printk.c b/kernel/printk.c index a23315d..ab3ffc5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending); void printk_tick(void) { - if (__get_cpu_var(printk_pending)) { - __get_cpu_var(printk_pending) = 0; + if (__this_cpu_read(printk_pending)) { + __this_cpu_write(printk_pending, 0); wake_up_interruptible(&log_wait); } } int printk_needs_cpu(int cpu) { - if (unlikely(cpu_is_offline(cpu))) + if (cpu_is_offline(cpu)) printk_tick(); - return per_cpu(printk_pending, cpu); + return __this_cpu_read(printk_pending); } void wake_up_klogd(void) -- cgit v0.10.2 From 1f5a24794a54588ea3a9efd521be31d826e0b9d7 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 9 Dec 2010 12:02:18 -0800 Subject: timers: Rename timerlist infrastructure to timerqueue Thomas pointed out a namespace collision between the new timerlist infrastructure I introduced and the existing timer_list.c So to avoid confusion, I've renamed the timerlist infrastructure to timerqueue. Reported-by: Thomas Gleixner Signed-off-by: John Stultz diff --git a/include/linux/timerlist.h b/include/linux/timerlist.h deleted file mode 100644 index c46b28a..0000000 --- a/include/linux/timerlist.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef _LINUX_TIMERLIST_H -#define _LINUX_TIMERLIST_H - -#include -#include - - -struct timerlist_node { - struct rb_node node; - ktime_t expires; -}; - -struct timerlist_head { - struct rb_root head; - struct timerlist_node *next; -}; - - -extern void timerlist_add(struct timerlist_head *head, - struct timerlist_node *node); -extern void timerlist_del(struct timerlist_head *head, - struct timerlist_node *node); -extern struct timerlist_node *timerlist_getnext(struct timerlist_head *head); -extern struct timerlist_node *timerlist_iterate_next( - struct timerlist_node *node); - -static inline void timerlist_init(struct timerlist_node *node) -{ - RB_CLEAR_NODE(&node->node); -} - -static inline void timerlist_init_head(struct timerlist_head *head) -{ - head->head = RB_ROOT; - head->next = NULL; -} -#endif /* _LINUX_TIMERLIST_H */ diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h new file mode 100644 index 0000000..406b103 --- /dev/null +++ b/include/linux/timerqueue.h @@ -0,0 +1,37 @@ +#ifndef _LINUX_TIMERQUEUE_H +#define _LINUX_TIMERQUEUE_H + +#include +#include + + +struct timerqueue_node { + struct rb_node node; + ktime_t expires; +}; + +struct timerqueue_head { + struct rb_root head; + struct timerqueue_node *next; +}; + + +extern void timerqueue_add(struct timerqueue_head *head, + struct timerqueue_node *node); +extern void timerqueue_del(struct timerqueue_head *head, + struct timerqueue_node *node); +extern struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head); +extern struct timerqueue_node *timerqueue_iterate_next( + struct timerqueue_node *node); + +static inline void timerqueue_init(struct timerqueue_node *node) +{ + RB_CLEAR_NODE(&node->node); +} + +static inline void timerqueue_init_head(struct timerqueue_head *head) +{ + head->head = RB_ROOT; + head->next = NULL; +} +#endif /* _LINUX_TIMERQUEUE_H */ diff --git a/lib/Makefile b/lib/Makefile index 8b475cf..9e2db72 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o 
dump_stack.o timerlist.o\ + rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o ratelimit.o show_mem.o \ diff --git a/lib/timerlist.c b/lib/timerlist.c deleted file mode 100644 index 9101b42..0000000 --- a/lib/timerlist.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Generic Timer-list - * - * Manages a simple list of timers, ordered by expiration time. - * Uses rbtrees for quick list adds and expiration. - * - * NOTE: All of the following functions need to be serialized - * to avoid races. No locking is done by this libary code. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include - -/** - * timerlist_add - Adds timer to timerlist. - * - * @head: head of timerlist - * @node: timer node to be added - * - * Adds the timer node to the timerlist, sorted by the - * node's expires value. - */ -void timerlist_add(struct timerlist_head *head, struct timerlist_node *node) -{ - struct rb_node **p = &head->head.rb_node; - struct rb_node *parent = NULL; - struct timerlist_node *ptr; - - /* Make sure we don't add nodes that are already added */ - WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); - - while (*p) { - parent = *p; - ptr = rb_entry(parent, struct timerlist_node, node); - if (node->expires.tv64 < ptr->expires.tv64) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&node->node, parent, p); - rb_insert_color(&node->node, &head->head); - - if (!head->next || node->expires.tv64 < head->next->expires.tv64) - head->next = node; -} - -/** - * timerlist_del - Removes a timer from the timerlist. - * - * @head: head of timerlist - * @node: timer node to be removed - * - * Removes the timer node from the timerlist. - */ -void timerlist_del(struct timerlist_head *head, struct timerlist_node *node) -{ - WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); - - /* update next pointer */ - if (head->next == node) { - struct rb_node *rbn = rb_next(&node->node); - - head->next = rbn ? - rb_entry(rbn, struct timerlist_node, node) : NULL; - } - rb_erase(&node->node, &head->head); - RB_CLEAR_NODE(&node->node); -} - - -/** - * timerlist_getnext - Returns the timer with the earlies expiration time - * - * @head: head of timerlist - * - * Returns a pointer to the timer node that has the - * earliest expiration time. - */ -struct timerlist_node *timerlist_getnext(struct timerlist_head *head) -{ - return head->next; -} - - -/** - * timerlist_iterate_next - Returns the timer after the provided timer - * - * @node: Pointer to a timer. - * - * Provides the timer that is after the given node. This is used, when - * necessary, to iterate through the list of timers in a timer list - * without modifying the list. 
- */ -struct timerlist_node *timerlist_iterate_next(struct timerlist_node *node) -{ - struct rb_node *next; - - if (!node) - return NULL; - next = rb_next(&node->node); - if (!next) - return NULL; - return container_of(next, struct timerlist_node, node); -} diff --git a/lib/timerqueue.c b/lib/timerqueue.c new file mode 100644 index 0000000..f46de84 --- /dev/null +++ b/lib/timerqueue.c @@ -0,0 +1,118 @@ +/* + * Generic Timer-queue + * + * Manages a simple queue of timers, ordered by expiration time. + * Uses rbtrees for quick list adds and expiration. + * + * NOTE: All of the following functions need to be serialized + * to avoid races. No locking is done by this libary code. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +/** + * timerqueue_add - Adds timer to timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be added + * + * Adds the timer node to the timerqueue, sorted by the + * node's expires value. + */ +void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) +{ + struct rb_node **p = &head->head.rb_node; + struct rb_node *parent = NULL; + struct timerqueue_node *ptr; + + /* Make sure we don't add nodes that are already added */ + WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); + + while (*p) { + parent = *p; + ptr = rb_entry(parent, struct timerqueue_node, node); + if (node->expires.tv64 < ptr->expires.tv64) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->node, parent, p); + rb_insert_color(&node->node, &head->head); + + if (!head->next || node->expires.tv64 < head->next->expires.tv64) + head->next = node; +} + +/** + * timerqueue_del - Removes a timer from the timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be removed + * + * Removes the timer node from the timerqueue. + */ +void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); + + /* update next pointer */ + if (head->next == node) { + struct rb_node *rbn = rb_next(&node->node); + + head->next = rbn ? + rb_entry(rbn, struct timerqueue_node, node) : NULL; + } + rb_erase(&node->node, &head->head); + RB_CLEAR_NODE(&node->node); +} + + +/** + * timerqueue_getnext - Returns the timer with the earlies expiration time + * + * @head: head of timerqueue + * + * Returns a pointer to the timer node that has the + * earliest expiration time. + */ +struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +{ + return head->next; +} + + +/** + * timerqueue_iterate_next - Returns the timer after the provided timer + * + * @node: Pointer to a timer. + * + * Provides the timer that is after the given node. This is used, when + * necessary, to iterate through the list of timers in a timer list + * without modifying the list. 
+ */ +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) +{ + struct rb_node *next; + + if (!node) + return NULL; + next = rb_next(&node->node); + if (!next) + return NULL; + return container_of(next, struct timerqueue_node, node); +} -- cgit v0.10.2 From 9bb99b147018945366c763b3d4d7008927dc8557 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 6 Dec 2010 13:32:12 -0800 Subject: timers: Fixup allmodconfig build issue Adds missed EXPORT_SYMBOL lines that cause the following build failures with allmodconfig: ERROR: "timerqueue_add" [drivers/rtc/rtc-core.ko] undefined! ERROR: "timerqueue_getnext" [drivers/rtc/rtc-core.ko] undefined! ERROR: "timerqueue_del" [drivers/rtc/rtc-core.ko] undefined! Reported-by: Ingo Molnar Reported-by: Thomas Gleixner Signed-off-by: John Stultz diff --git a/lib/timerqueue.c b/lib/timerqueue.c index f46de84..444b093 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -24,6 +24,7 @@ #include #include +#include /** * timerqueue_add - Adds timer to timerqueue. @@ -57,6 +58,7 @@ void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) if (!head->next || node->expires.tv64 < head->next->expires.tv64) head->next = node; } +EXPORT_SYMBOL_GPL(timerqueue_add); /** * timerqueue_del - Removes a timer from the timerqueue. @@ -80,7 +82,7 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) rb_erase(&node->node, &head->head); RB_CLEAR_NODE(&node->node); } - +EXPORT_SYMBOL_GPL(timerqueue_del); /** * timerqueue_getnext - Returns the timer with the earlies expiration time @@ -94,7 +96,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { return head->next; } - +EXPORT_SYMBOL_GPL(timerqueue_getnext); /** * timerqueue_iterate_next - Returns the timer after the provided timer @@ -116,3 +118,4 @@ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) return NULL; return container_of(next, struct timerqueue_node, node); } +EXPORT_SYMBOL_GPL(timerqueue_iterate_next); -- cgit v0.10.2 From 998adc3dda59f811966b3ccb21eb223680b25ec4 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 20 Sep 2010 19:19:17 -0700 Subject: hrtimers: Convert hrtimers to use timerlist infrastructure Converts the hrtimer code to use the new timerlist infrastructure Signed-off-by: John Stultz LKML Reference: <1290136329-18291-3-git-send-email-john.stultz@linaro.org> Reviewed-by: Thomas Gleixner CC: Alessandro Zummo CC: Thomas Gleixner CC: Richard Cochran diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index dd9954b..330586f 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -22,7 +22,7 @@ #include #include #include - +#include struct hrtimer_clock_base; struct hrtimer_cpu_base; @@ -79,8 +79,8 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure - * @node: red black tree node for time ordered insertion - * @_expires: the absolute expiry time in the hrtimers internal + * @node: timerqueue node, which also manages node.expires, + * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding * slack to the _softexpires value. 
For non range timers @@ -101,8 +101,7 @@ enum hrtimer_restart { * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { - struct rb_node node; - ktime_t _expires; + struct timerqueue_node node; ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; @@ -141,8 +140,7 @@ struct hrtimer_sleeper { struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; clockid_t index; - struct rb_root active; - struct rb_node *first; + struct timerqueue_head active; ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; @@ -183,43 +181,43 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { - timer->_expires = time; + timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; - timer->_expires = ktime_add_safe(time, delta); + timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) { timer->_softexpires = time; - timer->_expires = ktime_add_safe(time, ns_to_ktime(delta)); + timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { - timer->_expires.tv64 = tv64; + timer->node.expires.tv64 = tv64; timer->_softexpires.tv64 = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { - timer->_expires = ktime_add_safe(timer->_expires, time); + timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { - timer->_expires = ktime_add_ns(timer->_expires, ns); + timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { - return timer->_expires; + return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) @@ -229,7 +227,7 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { - return timer->_expires.tv64; + return timer->node.expires.tv64; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { @@ -238,12 +236,12 @@ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { - return ktime_to_ns(timer->_expires); + return ktime_to_ns(timer->node.expires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return ktime_sub(timer->_expires, timer->base->get_time()); + return ktime_sub(timer->node.expires, timer->base->get_time()); } #ifdef CONFIG_HIGH_RES_TIMERS diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ce66917..93976ad 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, 
node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); /* * clock_was_set() has changed base->offset so the @@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct rb_node **link = &base->active.rb_node; - struct rb_node *parent = NULL; - struct hrtimer *entry; - int leftmost = 1; - debug_activate(timer); - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct hrtimer, node); - /* - * We dont care about collisions. Nodes with - * the same expiry time stay together. - */ - if (hrtimer_get_expires_tv64(timer) < - hrtimer_get_expires_tv64(entry)) { - link = &(*link)->rb_left; - } else { - link = &(*link)->rb_right; - leftmost = 0; - } - } + timerqueue_add(&base->active, &timer->node); /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - if (leftmost) - base->first = &timer->node; - - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); - /* * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the * state of a possibly running callback. */ timer->state |= HRTIMER_STATE_ENQUEUED; - return leftmost; + return (&timer->node == base->active.next); } /* @@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - /* - * Remove the timer from the rbtree and replace the first - * entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); + if (&timer->node == timerqueue_getnext(&base->active)) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ if (reprogram && hrtimer_hres_active()) { @@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - rb_erase(&timer->node, &base->active); + timerqueue_del(&base->active, &timer->node); out: timer->state = newstate; } @@ -1123,11 +1090,13 @@ ktime_t hrtimer_get_next_event(void) if (!hrtimer_hres_active()) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, node); delta.tv64 = hrtimer_get_expires_tv64(timer); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) @@ -1157,6 +1126,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->base = &cpu_base->clock_base[clock_id]; hrtimer_init_timer_hres(timer); + timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -1270,14 +1240,14 @@ retry: for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; - struct rb_node *node; + struct timerqueue_node *node; basenow = ktime_add(now, base->offset); - while ((node = base->first)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is @@ -1433,7 +1403,7 @@ void hrtimer_run_pending(void) */ void hrtimer_run_queues(void) { - struct rb_node *node; + struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1442,9 +1412,11 @@ void 
hrtimer_run_queues(void) return; for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; + struct timerqueue_node *next; - if (!base->first) + base = &cpu_base->clock_base[index]; + next = timerqueue_getnext(&base->active); + if (!next) continue; if (gettime) { @@ -1454,10 +1426,10 @@ void hrtimer_run_queues(void) raw_spin_lock(&cpu_base->lock); - while ((node = base->first)) { + while ((node = next)) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); if (base->softirq_time.tv64 <= hrtimer_get_expires_tv64(timer)) break; @@ -1622,8 +1594,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } hrtimer_init_hres(cpu_base); } @@ -1634,10 +1608,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; - struct rb_node *node; + struct timerqueue_node *node; - while ((node = rb_first(&old_base->active))) { - timer = rb_entry(node, struct hrtimer, node); + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e3..32a19f9 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, { struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct rb_node *curr; + struct timerqueue_node *curr; unsigned long flags; next_one: i = 0; raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = base->first; + curr = timerqueue_getnext(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = rb_next(curr); + curr = timerqueue_iterate_next(curr); i++; } if (curr) { - timer = rb_entry(curr, struct hrtimer, node); + timer = container_of(curr, struct hrtimer, node); tmp = *timer; raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); -- cgit v0.10.2 From b007c389d3e09b823eccda1503390fa2a9adca0d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 10 Dec 2010 22:19:53 -0800 Subject: hrtimer: fix timerqueue conversion flub In converting the hrtimers to timerqueue, I missed a spot in hrtimer_run_queues where we loop running timers. We end up not pulling the new next value out and instead just use the last next value, causing boot time hangs in some cases. The proper fix is to pull timerqueue_getnext each iteration instead of using a local next value. 
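[Editorial aside, not part of the patch: the hang is easy to reproduce in a standalone sketch. The list below and the helpers getnext()/run_and_remove() are stand-ins for the timerqueue API, not kernel code.]

#include <stdio.h>

struct node { int expires; struct node *next; };

static struct node *head;

/* stand-in for timerqueue_getnext(): return the current queue head */
static struct node *getnext(void) { return head; }

/* stand-in for expiring a timer: the timer is removed from the queue */
static void run_and_remove(struct node *n) { head = n->next; }

int main(void)
{
	struct node c = { 30, NULL }, b = { 20, &c }, a = { 10, &b };
	struct node *n;

	head = &a;

	/* Correct form, matching the fix: refetch the head on every pass. */
	while ((n = getnext())) {
		printf("expiring %d\n", n->expires);
		run_and_remove(n);
	}

	/*
	 * The buggy form fetched the head once before the loop:
	 *
	 *	next = getnext();
	 *	while ((n = next)) { ... run_and_remove(n); ... }
	 *
	 * 'next' never advances past the first node, so the loop keeps
	 * processing an already-removed timer -- the boot-time hang
	 * described above.
	 */
	return 0;
}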
Reported-by: Ingo Molnar Signed-off-by: John Stultz diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 93976ad..7a7a206 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1412,11 +1412,8 @@ void hrtimer_run_queues(void) return; for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - struct timerqueue_node *next; - base = &cpu_base->clock_base[index]; - next = timerqueue_getnext(&base->active); - if (!next) + if (!timerqueue_getnext(&base->active)) continue; if (gettime) { @@ -1426,7 +1423,7 @@ void hrtimer_run_queues(void) raw_spin_lock(&cpu_base->lock); - while ((node = next)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); -- cgit v0.10.2 From 45f74264e18449cf3c93cccaf098ee6e9524ab78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Dec 2010 12:34:34 +0100 Subject: timerqueue: Make timerqueue_getnext() static inline No point in calling a function just to dereference a pointer. Signed-off-by: Thomas Gleixner Cc: John Stultz diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 406b103..d24aaba 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -20,10 +20,23 @@ extern void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); extern void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); -extern struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head); extern struct timerqueue_node *timerqueue_iterate_next( struct timerqueue_node *node); +/** + * timerqueue_getnext - Returns the timer with the earliest expiration time + * + * @head: head of timerqueue + * + * Returns a pointer to the timer node that has the + * earliest expiration time. + */ +static inline +struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +{ + return head->next; +} + static inline void timerqueue_init(struct timerqueue_node *node) { RB_CLEAR_NODE(&node->node); diff --git a/lib/timerqueue.c b/lib/timerqueue.c index 444b093..e3a1050 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -85,20 +85,6 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) EXPORT_SYMBOL_GPL(timerqueue_del); /** - * timerqueue_getnext - Returns the timer with the earliest expiration time - * - * @head: head of timerqueue - * - * Returns a pointer to the timer node that has the - * earliest expiration time. - */ -struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) -{ - return head->next; -} -EXPORT_SYMBOL_GPL(timerqueue_getnext); - -/** * timerqueue_iterate_next - Returns the timer after the provided timer * * @node: Pointer to a timer. -- cgit v0.10.2 From 7496351ad87e61e96b49dd7b43c6534e3401f566 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 30 Nov 2010 14:05:53 -0600 Subject: timers: Use this_cpu_read Eric asked for this.
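[Editorial aside, not part of the commit; the bracketed tglx note that follows refers to the generated code. A rough sketch of the expected difference on x86-64, assuming the usual per-cpu accessor expansion of this era:

	/* __get_cpu_var(tvec_bases): fetch the per-cpu offset, then load */
	/*	mov %gs:this_cpu_off, %rdx	*/
	/*	mov tvec_bases(%rdx), %rax	*/

	/* __this_cpu_read(tvec_bases): a single segment-relative load */
	/*	mov %gs:tvec_bases, %rax	*/
]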
[tglx: Because it generates faster code according to Erics ] Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Eric Dumazet Cc: Mathieu Desnoyers Cc: Tejun Heo Cc: linux-mm@kvack.org LKML-Reference: Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index 483e54b..beb97fd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1227,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, */ unsigned long get_next_timer_interrupt(unsigned long now) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); unsigned long expires; spin_lock(&base->lock); @@ -1267,7 +1267,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); hrtimer_run_pending(); -- cgit v0.10.2 From a8760eca6cf60ed303ad494ef45901f63165d2c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Dec 2010 11:28:02 +0100 Subject: x86: Check tsc available/disabled in the delayed init function The delayed TSC init function does not check whether the system has no TSC or TSC is disabled at the kernel command line, which results in a crash in the work queue based extended calibration due to division by zero because the basic calibration never happened. Add the missing checks and do not touch TSC when not available or disabled. Signed-off-by: Thomas Gleixner Cc: John Stultz diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index dc1393e..356a0d4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -965,6 +965,9 @@ out: static int __init init_tsc_clocksource(void) { + if (!cpu_has_tsc || tsc_disabled > 0) + return 0; + if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ -- cgit v0.10.2 From 3fb82d56ad003e804923185316236f26b30dfdd5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Nov 2010 16:11:40 -0800 Subject: x86, suspend: Avoid unnecessary smp alternatives switch during suspend/resume During suspend, we disable all the non boot cpus. And during resume we bring them all back again. So no need to do alternatives_smp_switch() in between. On my core 2 based laptop, this speeds up the suspend path by 15msec and the resume path by 5 msec (suspend/resume speed up differences can be attributed to the different P-states that the cpu is in during suspend/resume). Signed-off-by: Suresh Siddha LKML-Reference: <1290557500.4946.8.camel@sbsiddha-MOBL3.sc.intel.com> Cc: Rafael J. Wysocki Signed-off-by: H. 
Peter Anvin diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 76561d2..01171f6 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); extern int alternatives_text_reserved(void *start, void *end); +extern bool skip_smp_alternatives; #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5079f24..9f98eb4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) mutex_unlock(&smp_alt); } +bool skip_smp_alternatives; void alternatives_smp_switch(int smp) { struct smp_alt_module *mod; @@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp) printk("lockdep: fixing up alternatives.\n"); #endif - if (noreplace_smp || smp_alt_once) + if (noreplace_smp || smp_alt_once || skip_smp_alternatives) return; BUG_ON(!smp && (num_online_cpus() > 1)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 083e99d..837c81e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1166,6 +1166,20 @@ out: preempt_enable(); } +void arch_disable_nonboot_cpus_begin(void) +{ + /* + * Avoid the smp alternatives switch during the disable_nonboot_cpus(). + * In the suspend path, we will be back in the SMP mode shortly anyways. + */ + skip_smp_alternatives = true; +} + +void arch_disable_nonboot_cpus_end(void) +{ + skip_smp_alternatives = false; +} + void arch_enable_nonboot_cpus_begin(void) { set_mtrr_aps_delayed_init(); diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f..8ccc182 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -386,6 +386,14 @@ out: #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; +void __weak arch_disable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_disable_nonboot_cpus_end(void) +{ +} + int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; @@ -397,6 +405,7 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + arch_disable_nonboot_cpus_begin(); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { @@ -412,6 +421,8 @@ int disable_nonboot_cpus(void) } } + arch_disable_nonboot_cpus_end(); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v0.10.2 From 88606e80da0e8d862a42ee19e5bb60b01b940ea7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Dec 2010 21:37:13 +0100 Subject: MAINTAINERS: Update timer related entries Bring the existing file list up to date and add a new entry for timekeeping and ntp. 
Assign John Stultz to this new entry so he gets all the blame :) Signed-off-by: Thomas Gleixner Cc: John Stultz diff --git a/MAINTAINERS b/MAINTAINERS index 087912a..2a1b256 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2730,6 +2730,10 @@ M: Thomas Gleixner S: Maintained F: Documentation/timers/ F: kernel/hrtimer.c +F: kernel/time/clockevents.c +F: kernel/time/tick*.* +F: kernel/time/timer_*.c +F: include/linux/clockevents.h F: include/linux/hrtimer.h HIGH-SPEED SCC DRIVER FOR AX.25 @@ -4986,6 +4990,18 @@ F: drivers/media/common/saa7146* F: drivers/media/video/*7146* F: include/media/*7146* +TIMEKEEPING, NTP +M: John Stultz +M: Thomas Gleixner +S: Supported +F: include/linux/clocksource.h +F: include/linux/time.h +F: include/linux/timex.h +F: include/linux/timekeeping.h +F: kernel/time/clocksource.c +F: kernel/time/time*.c +F: kernel/time/ntp.c + TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie M: Kang Yong -- cgit v0.10.2 From 846f40455276617275284a4b76b89311b4aed0b9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 16 Dec 2010 15:18:48 +0000 Subject: GFS2: Don't flush delete workqueue when releasing the transaction lock There is no requirement to flush the delete workqueue before a gfs2 filesystem is suspended. The workqueue's work will just be suspended along with the rest of the tasks on the filesystem. This resolves a deadlock situation where the transaction lock's demotion code was trying to flush the delete workqueue while at the same time, the workqueue was waiting for the transaction lock. The delete workqueue is flushed by gfs2_make_fs_ro() already, so that umount/remount are correctly protected anyway. Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0d149dc..263561b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - flush_workqueue(gfs2_delete_workqueue); gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } -- cgit v0.10.2 From e27fc9641e8ddc8146f8e01f06e5eba2469698de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Nov 2010 21:36:11 -0800 Subject: rcu: increase synchronize_sched_expedited() batching The fix in commit #6a0cc49 requires more than three concurrent instances of synchronize_sched_expedited() before batching is possible. This patch uses a ticket-counter-like approach that is also not unrelated to Lai Jiangshan's Ring RCU to allow sharing of expedited grace periods even when there are only two concurrent instances of synchronize_sched_expedited(). This commit builds on Tejun's original posting, which may be found at http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding overflow of signed integers (other than via atomic_t), and fixing the detection of batching. Signed-off-by: Tejun Heo Signed-off-by: Paul E.
McKenney diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 49e8e16..af56148 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,8 @@ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c22c4ef..a363871 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #else /* #ifndef CONFIG_SMP */ -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); static int synchronize_sched_expedited_cpu_stop(void *data) { @@ -1041,8 +1042,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * robustness against future implementation changes. */ smp_mb(); /* See above comment block. */ - if (cpumask_first(cpu_online_mask) == smp_processor_id()) - atomic_inc(&synchronize_sched_expedited_count); return 0; } @@ -1056,43 +1055,86 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * lock that is acquired by a CPU-hotplug notifier. Failing to * observe this restriction will result in deadlock. * - * The synchronize_sched_expedited_cpu_stop() function is called - * in stop-CPU context, but in order to keep overhead down to a dull - * roar, we don't force this function to wait for its counterparts - * on other CPUs. One instance of this function will increment the - * synchronize_sched_expedited_count variable per call to - * try_stop_cpus(), but there is no guarantee what order this instance - * will occur in. The worst case is that it is last on one call - * to try_stop_cpus(), and the first on the next call. This means - * that piggybacking requires that synchronize_sched_expedited_count - * be incremented by 3: this guarantees that the piggybacking - * task has waited through an entire cycle of context switches, - * even in the worst case. + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). 
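+ *
+ * (Editorial worked example, not part of the original comment: suppose
+ * sync_sched_expedited_started is 4 and sync_sched_expedited_done is 1
+ * on entry, so our atomic_inc_return() yields firstsnap = snap = 5.  If
+ * try_stop_cpus() keeps failing but some other caller completes a grace
+ * period and pushes sync_sched_expedited_done to 5 or beyond, then
+ * UINT_CMP_GE(done, firstsnap) holds and we return without ever
+ * stopping the CPUs ourselves.)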
*/ void synchronize_sched_expedited(void) { - int snap, trycount = 0; + int firstsnap, s, snap, trycount = 0; - smp_mb(); /* ensure prior mod happens before capturing snap. */ - snap = atomic_read(&synchronize_sched_expedited_count) + 2; + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); get_online_cpus(); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ while (try_stop_cpus(cpu_online_mask, synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). */ if (trycount++ < 10) udelay(trycount * num_online_cpus()); else { synchronize_sched(); return; } - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { smp_mb(); /* ensure test happens before caller kfree */ return; } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. + */ get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started) - 1; + smp_mb(); /* ensure read is before try_stop_cpus(). */ } - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. + */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + put_online_cpus(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v0.10.2 From 20377f32dcb77941d450728da18cce5b1a7faec5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 10 Dec 2010 22:11:10 +0100 Subject: rcu: Stop chasing QS if another CPU did it for us When a CPU is idle and other CPUs handled its extended quiescent state to complete grace periods on its behalf, it will catch up with the completed grace-period numbers when it wakes up. But at this point there might be no more grace periods to complete; still, the woken CPU keeps its stale qs_pending value and will continue to chase quiescent states even though none is needed anymore. This results in clusters of spurious softirqs until a new real grace period is started, because if we keep chasing quiescent states after every grace period has been completed, rcu_report_qs_rdp() is puzzled and makes that state run into infinite loops. As suggested by Lai Jiangshan, just reset qs_pending if someone completed every grace period on our behalf. Suggested-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Paul E.
McKenney diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 120820ff..916f42b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -678,6 +678,14 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + + /* + * If another CPU handled our extended quiescent states and + * we have no more grace period to complete yet, then stop + * chasing quiescent states. + */ + if (rdp->completed == rnp->gpnum) + rdp->qs_pending = 0; } } -- cgit v0.10.2 From 5ff8e6f0535fe730e921ca347bc38dcb9e01791a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 10 Dec 2010 22:11:11 +0100 Subject: rcu: Keep gpnum and completed fields synchronized When a CPU that was in an extended quiescent state wakes up and catches up with grace periods that remote CPUs completed on its behalf, we update the completed field but not gpnum, which keeps the stale value of an earlier grace-period ID. Later, note_new_gpnum() will interpret the gap between the local CPU's and the node's grace-period IDs as a new grace period to handle and will then start hunting for a quiescent state. But if every grace period has already been completed, this interpretation is broken, and we'll be stuck in clusters of spurious softirqs because rcu_report_qs_rdp() will make this broken state run into an infinite loop. The solution, as suggested by Lai Jiangshan, is to ensure that the gpnum and completed fields are kept synchronized when we catch up with grace periods completed on our behalf by other CPUs. This way we won't start noting spurious new grace periods. Suggested-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Steven Rostedt diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 916f42b..8105271 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -680,6 +680,15 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat rdp->completed = rnp->completed; /* + * If we were in an extended quiescent state, we may have + * missed some grace periods that others CPUs took care on + * our behalf. Catch up with this state to avoid noting + * spurious new grace periods. + */ + if (rdp->completed > rdp->gpnum) + rdp->gpnum = rdp->completed; + + /* * If another CPU handled our extended quiescent states and * we have no more grace period to complete yet, then stop * chasing quiescent states. -- cgit v0.10.2 From 121dfc4b3eba9e2f3c42d35205a3510cc65b9931 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Dec 2010 15:02:47 -0800 Subject: rcu: fine-tune grace-period begin/end checks Use the CPU's bit in rnp->qsmask to determine whether or not the CPU should try to report a quiescent state. Handle overflow in the check for rdp->gpnum having fallen behind. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8105271..c39ec5b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -617,9 +617,17 @@ static void __init check_cpu_stall_init(void) static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { if (rdp->gpnum != rnp->gpnum) { - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one.
+ */ rdp->gpnum = rnp->gpnum; + if (rnp->qsmask & rdp->grpmask) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + } else + rdp->qs_pending = 0; } } @@ -681,19 +689,20 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* * If we were in an extended quiescent state, we may have - * missed some grace periods that others CPUs took care on + * missed some grace periods that others CPUs handled on * our behalf. Catch up with this state to avoid noting - * spurious new grace periods. + * spurious new grace periods. If another grace period + * has started, then rnp->gpnum will have advanced, so + * we will detect this later on. */ - if (rdp->completed > rdp->gpnum) + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) rdp->gpnum = rdp->completed; /* - * If another CPU handled our extended quiescent states and - * we have no more grace period to complete yet, then stop - * chasing quiescent states. + * If RCU does not need a quiescent state from this CPU, + * then make sure that this CPU doesn't go looking for one. */ - if (rdp->completed == rnp->gpnum) + if ((rnp->qsmask & rdp->grpmask) == 0) rdp->qs_pending = 0; } } -- cgit v0.10.2 From 0209f6490b030f35349a2bb71294f3fd75b0f36d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Dec 2010 16:07:52 -0800 Subject: rcu: limit rcu_node leaf-level fanout Some recent benchmarks have indicated possible lock contention on the leaf-level rcu_node locks. This commit therefore limits the number of CPUs per leaf-level rcu_node structure to 16, in other words, there can be at most 16 rcu_data structures fanning into a given rcu_node structure. Prior to this, the limit was 32 on 32-bit systems and 64 on 64-bit systems. Note that the fanout of non-leaf rcu_node structures is unchanged. The organization of accesses to the rcu_node tree is such that references to non-leaf rcu_node structures are much less frequent than to the leaf structures. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c39ec5b..01c8ad3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1869,8 +1869,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) + for (i = NUM_RCU_LVLS - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; + rsp->levelspread[0] = RCU_FANOUT_LEAF; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1a54be2..e8f057e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -31,46 +31,51 @@ /* * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. * In theory, it should be possible to add more levels straightforwardly. - * In practice, this has not been tested, so there is probably some - * bug somewhere. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. 
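+ *
+ * (Editorial example, not part of the original comment: with
+ * CONFIG_RCU_FANOUT = 64 and the 16-CPU leaf limit defined below,
+ * NR_CPUS = 4096 fits in three levels: one root fanning out to
+ * DIV_ROUND_UP(4096, 16 * 64) = 4 interior rcu_node structures,
+ * DIV_ROUND_UP(4096, 16) = 256 leaf rcu_node structures, and at
+ * most 16 rcu_data structures per leaf.)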
*/ #define MAX_RCU_LVLS 4 -#define RCU_FANOUT (CONFIG_RCU_FANOUT) -#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) -#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) -#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT +#if CONFIG_RCU_FANOUT > 16 +#define RCU_FANOUT_LEAF 16 +#else /* #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) +#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 # define NUM_RCU_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_SQ +#elif NR_CPUS <= RCU_FANOUT_2 # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_CUBE +#elif NR_CPUS <= RCU_FANOUT_3 # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) -# define NUM_RCU_LVL_3 NR_CPUS +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_FOURTH +#elif NR_CPUS <= RCU_FANOUT_4 # define NUM_RCU_LVLS 4 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) -# define NUM_RCU_LVL_4 NR_CPUS +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_4 (NR_CPUS) #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT */ +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) -- cgit v0.10.2 From b52573d2796274f7f31cfeff7185c320adcd4f12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Dec 2010 17:36:02 -0800 Subject: rcu: reduce __call_rcu()-induced contention on rcu_node structures When the current __call_rcu() function was written, the expedited APIs did not exist. The __call_rcu() implementation therefore went to great lengths to detect the end of old grace periods and to start new ones, all in the name of reducing grace-period latency. Now the expedited APIs do exist, and the usage of __call_rcu() has increased considerably. This commit therefore causes __call_rcu() to avoid worrying about grace periods unless there are a large number of RCU callbacks stacked up on the current CPU. Signed-off-by: Paul E. 
McKenney diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 01c8ad3..d0ddfea 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1435,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); /* Add the callback to our list. */ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ - } - /* * Force the grace period if too many callbacks or too long waiting. * Enforce hysteresis, and don't invoke force_quiescent_state() @@ -1459,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * is the only one waiting for a grace period to complete. */ if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + + /* Are we ignoring a completed grace period? */ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); local_irq_restore(flags); -- cgit v0.10.2 From 8a9c1cee26c0ece23b38c45b92b724438878f842 Mon Sep 17 00:00:00 2001 From: Mariusz Kozlowski Date: Wed, 15 Dec 2010 23:11:12 +0100 Subject: rculist: fix borked __list_for_each_rcu() macro This restores parentheses balance. Signed-off-by: Mariusz Kozlowski Signed-off-by: Paul E. McKenney diff --git a/include/linux/rculist.h b/include/linux/rculist.h index f31ef61..70d3ba5 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -244,7 +244,7 @@ static inline void list_splice_init_rcu(struct list_head *list, #define __list_for_each_rcu(pos, head) \ for (pos = rcu_dereference_raw(list_next_rcu(head)); \ pos != (head); \ - pos = rcu_dereference_raw(list_next_rcu((pos))) + pos = rcu_dereference_raw(list_next_rcu(pos))) /** * list_for_each_entry_rcu - iterate over rcu list of given type -- cgit v0.10.2 From 3c2dcf2aed5ea22ecf65a9a871c4963faec421b3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 Dec 2010 21:12:15 -0800 Subject: rcu: remove unused __list_for_each_rcu() macro Signed-off-by: Paul E.
McKenney diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 70d3ba5..2dea94f 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list, #define list_first_entry_rcu(ptr, type, member) \ list_entry_rcu((ptr)->next, type, member) -#define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference_raw(list_next_rcu(head)); \ - pos != (head); \ - pos = rcu_dereference_raw(list_next_rcu(pos))) - /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. -- cgit v0.10.2 From 43365bd7ff37979d2afdccbe953299ed64a4649b Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 15 Dec 2010 19:10:17 -0800 Subject: sched: Move periodic share updates to entity_tick() Long running entities that do not block (dequeue) require periodic updates to maintain accurate share values. (Note: group entities with several threads are quite likely to be non-blocking in many circumstances). By virtue of being long-running however, we will see entity ticks (otherwise the required update occurs in dequeue/put and we are done). Thus we can move the detection (and associated work) for these updates into the periodic path. This restores the 'atomicity' of update_curr() with respect to accounting. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101216031038.067028969@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c886717..16ee398 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -564,10 +564,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED cfs_rq->load_unacc_exec_time += delta_exec; - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); - } #endif } @@ -809,6 +805,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) reweight_entity(cfs_rq_of(se), se, shares); } + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { @@ -817,6 +821,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { } + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} #endif /* CONFIG_FAIR_GROUP_SCHED */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1133,6 +1141,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); + /* + * Update share accounting for long-running entities. + */ + update_entity_shares_tick(cfs_rq); + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother -- cgit v0.10.2 From 19e5eebb8eaa5ca3ff8aa18cb57ccb7a9f67277d Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 15 Dec 2010 19:10:18 -0800 Subject: sched: Fix interactivity bug by charging unaccounted run-time on entity re-weight Mike Galbraith reported poor interactivity[*] when the new shares distribution code was combined with autogroups. The root cause turns out to be a mis-ordering of accounting accrued execution time and shares updates. 
Since update_curr() is issued hierarchically, updating the parent entity weights to reflect child enqueue/dequeue results in the parent's unaccounted execution time then being accrued (vs vruntime) at the new weight as opposed to the weight present at accumulation. While this doesn't have much effect on processes with timeslices that cross a tick, it is particularly problematic for an interactive process (e.g. Xorg) which incurs many (tiny) timeslices. In this scenario almost all updates are at dequeue which can result in significant fairness perturbation (especially if it is the only thread, resulting in potential {tg->shares, MIN_SHARES} transitions). Correct this by ensuring unaccounted time is accumulated prior to manipulating an entity's weight. [*] http://xkcd.com/619/ is perversely Nostradamian here. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Linus Torvalds LKML-Reference: <20101216031038.159704378@google.com> Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 16ee398..c62ebae 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -765,8 +765,12 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { - if (se->on_rq) + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); account_entity_dequeue(cfs_rq, se); + } update_load_set(&se->load, weight); -- cgit v0.10.2 From e681041388e61ecd7f99dba66b3c1db11a564d92 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 30 Nov 2010 13:55:39 -0600 Subject: x86, UV: Add common uv_early_read_mmr() function for reading MMRs Early in boot, reading MMRs from the UV hub controller require calls to early_ioremap()/early_iounmap(). Rather than duplicating code, add a common function to do the map/read/unmap. 
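[Editorial summary, not part of the commit; both halves of this before/after sketch are taken from the diff that follows:

	/* before: every early MMR read open-coded the map/read/unmap dance */
	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
	node_id.v = *mmr;
	early_iounmap(mmr, sizeof(*mmr));

	/* after: one helper hides it */
	node_id.v = uv_early_read_mmr(UVH_NODE_ID);
]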
Signed-off-by: Jack Steiner LKML-Reference: <20101130195926.834804371@sgi.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c1c52c3..0c3675f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -48,6 +48,16 @@ unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); +static unsigned long __init uv_early_read_mmr(unsigned long addr) +{ + unsigned long val, *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + return val; +} + static inline bool is_GRU_range(u64 start, u64 end) { return start >= gru_start_paddr && end <= gru_end_paddr; @@ -58,16 +68,12 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int early_get_nodeid(void) +static int __init early_get_nodeid(void) { union uvh_node_id_u node_id; - unsigned long *mmr; - - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); - node_id.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); /* Currently, all blades have same revision number */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); uv_min_hub_revision_id = node_id.s.revision; return node_id.s.node_id; @@ -75,11 +81,7 @@ static int early_get_nodeid(void) static void __init early_get_apic_pnode_shift(void) { - unsigned long *mmr; - - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr)); - uvh_apicid.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); if (!uvh_apicid.v) /* * Old bios, use default value @@ -95,12 +97,8 @@ static void __init early_get_apic_pnode_shift(void) static void __init uv_set_apicid_hibit(void) { union uvh_lb_target_physical_apic_id_mask_u apicid_mask; - unsigned long *mmr; - mmr = early_ioremap(UV_LOCAL_MMR_BASE | - UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr)); - apicid_mask.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK); uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; } -- cgit v0.10.2 From d8850ba425d9823d3184bd52f065899dac4689f9 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 30 Nov 2010 13:55:40 -0600 Subject: x86, UV: Fix the effect of extra bits in the hub nodeid register UV systems can be partitioned into multiple independent SSIs. Large partitioned systems may have extra bits in the node_id register. These bits are used when the total memory on all SSIs exceeds 16TB. These extra bits need to be ignored when calculating x2apic_extra_bits. 
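[Editorial worked example with hypothetical values: if m_n_config.s.n_skt is 6, early_get_pnodeid() below masks with (1 << 6) - 1 = 0x3f, so a node_id.s.node_id of 0x8c -- which has an extra partition bit set above the socket field -- yields pnode = (0x8c >> 1) & 0x3f = 0x06 instead of leaking the high bits into x2apic_extra_bits.]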
Signed-off-by: Jack Steiner LKML-Reference: <20101130195926.972776133@sgi.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0c3675f..2a3f2a7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -68,15 +68,19 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int __init early_get_nodeid(void) +static int __init early_get_pnodeid(void) { union uvh_node_id_u node_id; + union uvh_rh_gam_config_mmr_u m_n_config; + int pnode; /* Currently, all blades have same revision number */ node_id.v = uv_early_read_mmr(UVH_NODE_ID); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); uv_min_hub_revision_id = node_id.s.revision; - return node_id.s.node_id; + pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + return pnode; } static void __init early_get_apic_pnode_shift(void) @@ -104,10 +108,10 @@ static void __init uv_set_apicid_hibit(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int nodeid; + int pnodeid; if (!strcmp(oem_id, "SGI")) { - nodeid = early_get_nodeid(); + pnodeid = early_get_pnodeid(); early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; @@ -117,7 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - nodeid << (uvh_apicid.s.pnode_shift - 1); + pnodeid << uvh_apicid.s.pnode_shift; uv_system_type = UV_NON_UNIQUE_APIC; uv_set_apicid_hibit(); return 1; @@ -680,27 +684,32 @@ void uv_nmi_init(void) void __init uv_system_init(void) { union uvh_rh_gam_config_mmr_u m_n_config; + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io; int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; - unsigned short pnode_mask; + unsigned short pnode_mask, pnode_io_mask; map_low_mmrs(); m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); + n_io = mmioh.s.n_io; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; pnode_mask = (1 << n_val) - 1; + pnode_io_mask = (1 << n_io) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", - n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n", + n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask); printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); @@ -733,7 +742,7 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - pnode = (i * 64 + j); + pnode = (i * 64 + j) & pnode_mask; uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; @@ -754,6 +763,7 @@ void __init uv_system_init(void) /* * 
apic_pnode_shift must be set before calling uv_apicid_to_pnode(); */ + uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); @@ -770,7 +780,6 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; @@ -794,7 +803,7 @@ void __init uv_system_init(void) map_gru_high(max_pnode); map_mmr_high(max_pnode); - map_mmioh_high(max_pnode); + map_mmioh_high(max_pnode & pnode_io_mask); uv_cpu_init(); uv_scir_register_cpu_notifier(); -- cgit v0.10.2 From 94462ad3b14739d158a1ab87bb30008c1e5a6bc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Nov 2010 13:15:42 -0500 Subject: module: Move RO/NX module protection to after ftrace module update The commit: 84e1c6bb38eb318e456558b610396d9f1afaabf0 x86: Add RO/NX protection for loadable kernel modules Broke the function tracer with this output: ------------[ cut here ]------------ WARNING: at kernel/trace/ftrace.c:1014 ftrace_bug+0x114/0x171() Hardware name: Precision WorkStation 470 Modules linked in: i2c_core(+) Pid: 86, comm: modprobe Not tainted 2.6.37-rc2+ #68 Call Trace: [] warn_slowpath_common+0x85/0x9d [] ? __process_new_adapter+0x7/0x34 [i2c_core] [] ? __process_new_adapter+0x7/0x34 [i2c_core] [] warn_slowpath_null+0x1a/0x1c [] ftrace_bug+0x114/0x171 [] ? __process_new_adapter+0x7/0x34 [i2c_core] [] ftrace_process_locs+0x1ae/0x274 [] ? __process_new_adapter+0x7/0x34 [i2c_core] [] ftrace_module_notify+0x39/0x44 [] notifier_call_chain+0x37/0x63 [] __blocking_notifier_call_chain+0x46/0x5b [] blocking_notifier_call_chain+0x14/0x16 [] sys_init_module+0x73/0x1f3 [] system_call_fastpath+0x16/0x1b ---[ end trace 2aff4f4ca53ec746 ]--- ftrace faulted on writing [] __process_new_adapter+0x7/0x34 [i2c_core] The cause was that the module text was set to read only before ftrace could convert the calls to mcount to nops. Thus, the conversions failed due to not being able to write to the text locations. The simple fix is to move setting the module to read only after the module notifiers are called (where ftrace sets the module mcounts to nops). Reported-by: Peter Zijlstra Acked-by: Rusty Russell Signed-off-by: Steven Rostedt diff --git a/kernel/module.c b/kernel/module.c index 562f665..34e00b7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2816,18 +2816,6 @@ static struct module *load_module(void __user *umod, kfree(info.strmap); free_copy(&info); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); - /* Done! 
*/ trace_module_load(mod); return mod; @@ -2888,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) -- cgit v0.10.2 From 56d91f132c9be66e98cce1b1e77a28027048bb26 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 16 Dec 2010 19:09:24 -0800 Subject: x86, acpi: Add MAX_LOCAL_APIC for 32bit We should use MAX_LOCAL_APIC for max apic ids and MAX_APICS as the number of local apics. The apic_version[] array should likewise be sized by MAX_LOCAL_APIC. Signed-off-by: Yinghai Lu LKML-Reference: <4D0AD464.2020408@kernel.org> Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index a859ca4..47a30ff 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -145,6 +145,7 @@ #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 +# define MAX_LOCAL_APIC 256 #else # define MAX_IO_APICS 128 # define MAX_LOCAL_APIC 32768 diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c82868e..7c1aebf 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,7 +6,7 @@ #include #include -extern int apic_version[MAX_APICS]; +extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4638396..0d5d07f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1707,7 +1707,7 @@ void __init init_apic_mappings(void) * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int apic_version[MAX_APICS]; +int apic_version[MAX_LOCAL_APIC]; int __init APIC_init_uniprocessor(void) { -- cgit v0.10.2 From d3bd058826aa8b79590cca6c8e6d1557bf576ada Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 16 Dec 2010 19:09:58 -0800 Subject: x86, acpi: Parse all SRAT cpu entries even above the cpu number limitation Recent new Intel systems use a different order in the MADT: they list all thread0 entries first, then all thread1 entries. But the SRAT table still uses the old order, listing the cpus of one socket all together. If the user has compiled with a limited NR_CPUS or boots with nr_cpus=, some cpus' apic-id-to-node mappings may never be put into apicid_to_node[]. For example, a 4-socket system with 64 cpus booted with nr_cpus=32 will crash: [ 9.106288] Total of 32 processors activated (136190.88 BogoMIPS). [ 9.235021] divide error: 0000 [#1] SMP [ 9.235315] last sysfs file: [ 9.235481] CPU 1 [ 9.235592] Modules linked in: [ 9.245398] [ 9.245478] Pid: 2, comm: kthreadd Not tainted 2.6.37-rc1-tip-yh-01782-ge92ef79-dirty #274 /Sun Fire x4800 [ 9.265415] RIP: 0010:[] [] select_task_rq_fair+0x4f0/0x623 ... [ 9.645938] RIP [] select_task_rq_fair+0x4f0/0x623 [ 9.665356] RSP [ 9.665568] ---[ end trace 2296156d35fdfc87 ]--- So just parse all cpu entries in the SRAT. Also add apicid checking against MAX_LOCAL_APIC, so that we cannot index out of the bounds of apicid_to_node[]. It fixes the following bug too.
https://bugzilla.kernel.org/show_bug.cgi?id=22662 -v2: expand to 32bit according to hpa need to add MAX_LOCAL_APIC for 32bit Reported-and-Tested-by: Wu Fengguang Reported-by: Bjorn Helgaas Tested-by: Myron Stowe Signed-off-by: Yinghai Lu LKML-Reference: <4D0AD486.9020704@kernel.org> Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c05872a..f19d667 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; + if (id >= (MAX_LOCAL_APIC-1)) { + printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + return; + } + if (!enabled) { ++disabled_cpus; return; diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index a17dffd..f164345 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) /* mark this node as "seen" in node bitmap */ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + /* don't need to check apic_id here, because it is always 8 bits */ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d..171a0aa 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; @@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 5718566..d9926af 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_type id, int __init acpi_numa_init(void) { int ret = 0; + int nr_cpu_entries = nr_cpu_ids; + +#ifdef CONFIG_X86 + /* + * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= + * SRAT cpu entries could have different order with that in MADT. + * So go over all cpu entries in SRAT to get apicid to node mapping. 
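+ *
+ * (Editorial example, not part of the original comment: on a 4-socket,
+ * 64-cpu system booted with nr_cpus=32, the MADT lists all thread0
+ * entries first, so every socket contributes online cpus; but the
+ * first 32 SRAT entries, in socket order, may cover only sockets 0
+ * and 1, leaving apicid_to_node[] unset for cpus that actually run.)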
+ */ + nr_cpu_entries = MAX_LOCAL_APIC; +#endif /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, - acpi_parse_x2apic_affinity, nr_cpu_ids); + acpi_parse_x2apic_affinity, nr_cpu_entries); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, - acpi_parse_processor_affinity, nr_cpu_ids); + acpi_parse_processor_affinity, nr_cpu_entries); ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS);
-- cgit v0.10.2
From c8217b8305e5e75c23617f2f4cd262527d952c0a Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 13 Dec 2010 10:51:57 -0600 Subject: x86, paravirt: Use native_halt on a halt, not native_safe_halt halt() should use native_halt(); safe_halt() uses native_safe_halt(). If CONFIG_PARAVIRT=y, halt() is defined in arch/x86/include/asm/paravirt.h as static inline void halt(void) { PVOP_VCALL0(pv_irq_ops.safe_halt); } Otherwise (no CONFIG_PARAVIRT) halt() in arch/x86/include/asm/irqflags.h is static inline void halt(void) { native_halt(); } So the CONFIG_PARAVIRT case of using native_safe_halt() for a halt() looks like an oversight. It probably hasn't shown up as a problem because the local APIC is disabled on a shutdown or restart. But if we disable interrupts and call halt(), we shouldn't expect halt() to re-enable them. Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: H. Peter Anvin
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ef99758..7709c12 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -112,7 +112,7 @@ static inline void arch_safe_halt(void) static inline void halt(void) { - PVOP_VCALL0(pv_irq_ops.safe_halt); + PVOP_VCALL0(pv_irq_ops.halt); } static inline void wbinvd(void)
-- cgit v0.10.2
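The fix matters because the two primitives differ precisely in their interrupt behavior: native_safe_halt() issues sti; hlt, re-enabling interrupts before halting, while native_halt() issues a bare hlt. A minimal sketch of the kind of caller the change protects (the function name is invented here for illustration, not taken from the patch):

/*
 * Illustrative only: a shutdown-style loop that relies on halt()
 * not re-enabling interrupts. With the old PVOP mapping to
 * pv_irq_ops.safe_halt, the implied sti would silently undo the
 * local_irq_disable() below.
 */
static void example_park_this_cpu(void)
{
	local_irq_disable();	/* no interrupts wanted from here on */
	for (;;)
		halt();		/* must remain a plain hlt */
}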
From 9e76a97efd31a08cb19d0ba12013b8fb4ad3e474 Mon Sep 17 00:00:00 2001 From: "R, Durgadoss" Date: Mon, 3 Jan 2011 17:22:04 +0530 Subject: x86, hwmon: Add core threshold notification to therm_throt.c This patch adds code to therm_throt.c to notify core thermal threshold events. These thresholds are configured through the IA32_THERM_INTERRUPT register, and their status/log bits are monitored through the IA32_THERM_STATUS register. The necessary #defines are in msr-index.h. A callback is added to mce.h to notify the thermal stack about the threshold events. Signed-off-by: Durgadoss R LKML-Reference: Signed-off-by: H. Peter Anvin
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c62c13c..eb16e94 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); +/* Interrupt Handler for core thermal thresholds */ +extern int (*platform_thermal_notify)(__u64 msr_val); + #ifdef CONFIG_X86_THERMAL_VECTOR extern void mcheck_intel_therm_init(void); #else
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6b89f5e..622c80b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -253,6 +253,18 @@ #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) #define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) +/* Thermal Thresholds Support */ +#define THERM_INT_THRESHOLD0_ENABLE (1 << 15) +#define THERM_SHIFT_THRESHOLD0 8 +#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0) +#define THERM_INT_THRESHOLD1_ENABLE (1 << 23) +#define THERM_SHIFT_THRESHOLD1 16 +#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1) +#define THERM_STATUS_THRESHOLD0 (1 << 6) +#define THERM_LOG_THRESHOLD0 (1 << 7) +#define THERM_STATUS_THRESHOLD1 (1 << 8) +#define THERM_LOG_THRESHOLD1 (1 << 9) + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4b68326..e12246f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -53,8 +53,13 @@ struct thermal_state { struct _thermal_state core_power_limit; struct _thermal_state package_throttle; struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; }; +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -200,6 +205,22 @@ static int therm_throt_process(bool new_event, int event, int level) return 0; } +static int thresh_event_valid(int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + return 1; +} + #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, @@ -313,6 +334,22 @@ device_initcall(thermal_throttle_init_device); #define PACKAGE_THROTTLED ((__u64)2 << 62) #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1)) + platform_thermal_notify(msr_val); +} + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { @@ -321,6 +358,9 @@ static void intel_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0)
-- cgit v0.10.2
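With the platform_thermal_notify hook exported, a thermal driver can subscribe to these events. The sketch below is an illustration only (the example_* names are invented and error handling is omitted); it shows how a consumer might decode the logged threshold bits delivered from IA32_THERM_STATUS:

#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/mce.h>	/* platform_thermal_notify */
#include <asm/msr.h>	/* THERM_LOG_THRESHOLD* via msr-index.h */

/* Hypothetical consumer of the new hook; not part of the patch. */
static int example_core_thresh_event(__u64 msr_val)
{
	if (msr_val & THERM_LOG_THRESHOLD0)
		pr_info("core crossed thermal threshold 0\n");
	if (msr_val & THERM_LOG_THRESHOLD1)
		pr_info("core crossed thermal threshold 1\n");
	return 0;
}

static int __init example_thermal_init(void)
{
	platform_thermal_notify = example_core_thresh_event;
	return 0;
}
late_initcall(example_thermal_init);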
From cfa60917f0ba6eca83f41aef3cb4a7dd7736ac9f Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 3 Jan 2011 12:03:53 -0600 Subject: x86, UV, BAU: Extend for more than 16 cpus per socket Fix a hard-coded limit of a maximum of 16 cpus per socket. The UV Broadcast Assist Unit code initializes by scanning the cpu topology of the system and assigning a master cpu for each socket and UV hub. That scan assumed a limit of 16 cpus per socket. With Westmere we are going over that limit. The UV hub hardware will allow up to 32. If the scan finds that the system has gone over the limit, it returns an error; we print a warning and fall back to doing TLB shootdowns without the BAU. Signed-off-by: Cliff Wickman Cc: # .37.x LKML-Reference: Signed-off-by: Ingo Molnar
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 42d412f..ce1d54c 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -26,20 +26,22 @@ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. * - * We will use 31 sets, one for sending BAU messages from each of the 32 + * We will use one set for sending BAU messages from each of the * cpu's on the uvhub. * * TLB shootdown will use the first of the 8 descriptors of each set. * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). */ +#define MAX_CPUS_PER_UVHUB 64 +#define MAX_CPUS_PER_SOCKET 32 +#define UV_ADP_SIZE 64 /* hardware-provided max. */ +#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */ #define UV_ITEMS_PER_DESCRIPTOR 8 /* the 'throttle' to prevent the hardware stay-busy bug */ #define MAX_BAU_CONCURRENT 3 -#define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 -#define UV_ADP_SIZE 32 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 #define UV_NET_ENDPOINT_INTD 0x38 @@ -100,7 +102,6 @@ * number of destination side software ack resources */ #define DEST_NUM_RESOURCES 8 -#define MAX_CPUS_PER_NODE 32 /* * completion statuses for sending a TLB flush message */
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index ba9caa8..df58e9c 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode) /* * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub + * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE) */ bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); @@ -1490,7 +1490,7 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static void __init uv_init_per_cpu(int nuvhubs) +static int __init uv_init_per_cpu(int nuvhubs) { int i; int cpu; @@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs) struct bau_control *smaster = NULL; struct socket_desc { short num_cpus; - short cpu_number[16]; + short cpu_number[MAX_CPUS_PER_SOCKET]; }; struct uvhub_desc { unsigned short socket_mask; @@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs) sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; + if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { + printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus); + return 1; + } } for (uvhub = 0; uvhub < nuvhubs; uvhub++) { if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) @@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs) bcp->uvhub_master = hmaster; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> blade_processor_id; + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { + printk(KERN_EMERG + "%d cpus per uvhub invalid\n", + bcp->uvhub_cpu); + return 1; + } } nextsocket: socket++; @@ -1595,6 +1605,7 @@ nextsocket: bcp->congested_reps = congested_reps; bcp->congested_period = congested_period; } + return 0; } /* @@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void) spin_lock_init(&disable_lock); congested_cycles = microsec_2_cycles(congested_response_us); - uv_init_per_cpu(nuvhubs); + if (uv_init_per_cpu(nuvhubs)) { + nobau = 1; + return 0; + } uv_partition_base_pnode = 0x7fffffff; for (uvhub = 0; uvhub < nuvhubs; uvhub++)
-- cgit v0.10.2
From 6706125e291bd3dddd269e043323a6ab93ccd5fb Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Fri, 31 Dec 2010 21:58:58 +0800 Subject: sched: Remove redundant CONFIG_CGROUP_SCHED ifdef CONFIG_[FAIR|RT]_GROUP_SCHED always implies CONFIG_CGROUP_SCHED. Signed-off-by: Yong Zhang Cc: Peter Zijlstra LKML-Reference: <1293803938-8157-1-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar
diff --git a/kernel/sched.c b/kernel/sched.c index 3925a1b..9f9dd8d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7777,7 +7777,6 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED /* * How much cpu bandwidth does init_task_group get? * @@ -7798,16 +7797,13 @@ void __init sched_init(void) * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); -#endif #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); #endif -#endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0;
-- cgit v0.10.2
From d50d8fe192428090790e7178e9507e981e0b005b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 4 Jan 2011 17:20:54 +1030 Subject: x86, mm: Initialize initial_page_table before paravirt jumps v2.6.36-rc8-54-gb40827f (x86-32, mm: Add an initial page table for core bootstrapping) made x86 boot using initial_page_table and broke lguest. For 2.6.37 we simply cut & pasted the initialization code into lguest (da32dac10126 "lguest: populate initial_page_table"); now we fix it properly by doing that initialization before the paravirt jump. Signed-off-by: Rusty Russell Acked-by: Jeremy Fitzhardinge Cc: lguest Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <201101041720.54535.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c0dbd9a..e3a8bb9 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -139,39 +139,6 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_PARAVIRT - /* This is can only trip for a broken bootloader... */ - cmpw $0x207, pa(boot_params + BP_version) - jb default_entry - - /* Paravirt-compatible boot parameters. Look to see what architecture - we're booting under. */ - movl pa(boot_params + BP_hardware_subarch), %eax - cmpl $num_subarch_entries, %eax - jae bad_subarch - - movl pa(subarch_entries)(,%eax,4), %eax - subl $__PAGE_OFFSET, %eax - jmp *%eax - -bad_subarch: -WEAK(lguest_entry) -WEAK(xen_entry) - /* Unknown implementation; there's really - nothing we can do at this point. */ - ud2a - - __INITDATA - -subarch_entries: - .long default_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ - .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ -num_subarch_entries = (. - subarch_entries) / 4 -.previous -#endif /* CONFIG_PARAVIRT */ - /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond __brk_base. The variable @@ -181,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -default_entry: #ifdef CONFIG_X86_PAE /* @@ -261,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(initial_page_table+0xffc) #endif - jmp 3f + +#ifdef CONFIG_PARAVIRT + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) + jb default_entry + + /* Paravirt-compatible boot parameters. Look to see what architecture + we're booting under. */ + movl pa(boot_params + BP_hardware_subarch), %eax + cmpl $num_subarch_entries, %eax + jae bad_subarch + + movl pa(subarch_entries)(,%eax,4), %eax + subl $__PAGE_OFFSET, %eax + jmp *%eax + +bad_subarch: +WEAK(lguest_entry) +WEAK(xen_entry) + /* Unknown implementation; there's really + nothing we can do at this point. */ + ud2a + + __INITDATA + +subarch_entries: + .long default_entry /* normal x86/PC */ + .long lguest_entry /* lguest hypervisor */ + .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ +num_subarch_entries = (. - subarch_entries) / 4 +.previous +#else + jmp default_entry +#endif /* CONFIG_PARAVIRT */ + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but @@ -282,7 +283,7 @@ ENTRY(startup_32_smp) movl %eax,%fs movl %eax,%gs #endif /* CONFIG_SMP */ -3: +default_entry: /* * New page tables may be in 4Mbyte page mode and may @@ -622,13 +623,13 @@ ENTRY(initial_code) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(initial_pg_pmd) +initial_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(initial_page_table) .fill 1024,4,0 #endif -ENTRY(initial_pg_fixmap) +initial_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index e7d5382..4f420c2f 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -4,7 +4,6 @@ #include #include #include -#include /*G:020 * Our story starts with the kernel booting into startup_32 in @@ -38,113 +37,9 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - call init_pagetables - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - * - * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they - * don't have a stack at this point, so we can't just use call and ret. - */ -init_pagetables: -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif -#define pa(X) ((X) - __PAGE_OFFSET) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = \ - PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_page_table+0xffc) -#endif - ret - /*G:055 * We create a macro which puts the assembler code between lgstart_ and lgend_ * markers. These templates are put in the .text section: they can't be
-- cgit v0.10.2
From 4f8219875a0dad2cfad9e93a3fafcd9626db98d2 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 16 Dec 2010 15:09:52 +0100 Subject: sched, autogroup: Fix potential access to freed memory Oleg pointed out that the /proc interface kref_get() usage may race with the final put during autogroup_move_group(). A signal->autogroup assignment may be in flight when the /proc interface dereferences it, leaving it with a reference to an already dead group. Reported-by: Oleg Nesterov Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1292508592.5940.28.camel@maggy.simson.net> Signed-off-by: Ingo Molnar
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 57a7ac28..c80fedc 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -41,6 +41,20 @@ static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) return ag; } +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ + struct autogroup *ag; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return autogroup_kref_get(&autogroup_default); + + ag = autogroup_kref_get(p->signal->autogroup); + unlock_task_sighand(p, &flags); + + return ag; +} + static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -149,11 +163,7 @@ EXPORT_SYMBOL(sched_autogroup_detach); void sched_autogroup_fork(struct signal_struct *sig) { - struct task_struct *p = current; - - spin_lock_irq(&p->sighand->siglock); - sig->autogroup = autogroup_kref_get(p->signal->autogroup); - spin_unlock_irq(&p->sighand->siglock); + sig->autogroup = autogroup_task_get(current); } void sched_autogroup_exit(struct signal_struct *sig) @@ -172,7 +182,6 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -/* Called with siglock held. */ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) { static unsigned long next = INITIAL_JIFFIES; @@ -194,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) return -EAGAIN; next = HZ / 10 + jiffies; - ag = autogroup_kref_get(p->signal->autogroup); + ag = autogroup_task_get(p); down_write(&ag->lock); err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); @@ -209,7 +218,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) { - struct autogroup *ag = autogroup_kref_get(p->signal->autogroup); + struct autogroup *ag = autogroup_task_get(p); down_read(&ag->lock); seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
-- cgit v0.10.2
From 101e5f77bf35679809586e250b6c62193d2ed179 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 31 Dec 2010 09:32:30 +0100 Subject: sched, autogroup: Fix reference leak The cgroup exit mess also uncovered a struct autogroup reference leak. copy_process() was simply freeing vs putting the signal_struct, stranding a reference. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov LKML-Reference: <1293784350.6839.2.camel@marge.simson.net> Signed-off-by: Ingo Molnar
diff --git a/kernel/fork.c b/kernel/fork.c index b6f2475..0672444 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1317,7 +1317,7 @@ bad_fork_cleanup_mm: } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - free_signal_struct(p->signal); + put_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs:
-- cgit v0.10.2
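The net effect is that every reader of p->signal->autogroup goes through the same take-siglock-then-kref pattern. A sketch of a reader under the new scheme (illustrative; it assumes the file-local autogroup_kref_put() helper that sched_autogroup.c pairs with autogroup_kref_get()):

/* Hypothetical reader, modeled on the /proc show path above. */
static void example_show_autogroup(struct task_struct *p, struct seq_file *m)
{
	struct autogroup *ag = autogroup_task_get(p);	/* ref taken under siglock */

	down_read(&ag->lock);
	seq_printf(m, "/autogroup-%ld\n", ag->id);
	up_read(&ag->lock);

	autogroup_kref_put(ag);				/* drop our reference */
}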
Biederman" LKML-Reference: <4D23BD9C.3070102@kernel.org> Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 7c1aebf..0c90dd9 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -5,6 +5,7 @@ #include #include +#include extern int apic_version[]; extern int pic_mode; @@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) +#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) struct physid_mask { unsigned long mask[PHYSID_ARRAY_SIZE]; @@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t; test_and_set_bit(physid, (map).mask) #define physids_and(dst, src1, src2) \ - bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) #define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) #define physids_clear(map) \ - bitmap_zero((map).mask, MAX_APICS) + bitmap_zero((map).mask, MAX_LOCAL_APIC) #define physids_complement(dst, src) \ - bitmap_complement((dst).mask, (src).mask, MAX_APICS) + bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC) #define physids_empty(map) \ - bitmap_empty((map).mask, MAX_APICS) + bitmap_empty((map).mask, MAX_LOCAL_APIC) #define physids_equal(map1, map2) \ - bitmap_equal((map1).mask, (map2).mask, MAX_APICS) + bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC) #define physids_weight(map) \ - bitmap_weight((map).mask, MAX_APICS) + bitmap_weight((map).mask, MAX_LOCAL_APIC) #define physids_shift_right(d, s, n) \ - bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) + bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC) #define physids_shift_left(d, s, n) \ - bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) + bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC) static inline unsigned long physids_coerce(physid_mask_t *map) { @@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map) map->mask[0] = physids; } -/* Note: will create very large stack frames if physid_mask_t is big */ -#define physid_mask_of_physid(physid) \ - ({ \ - physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ - physid_set(physid, __physid_mask); \ - __physid_mask; \ - }) - static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) { physids_clear(*map); diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index 4a7f96d..c0a955a 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -15,13 +15,6 @@ #ifdef CONFIG_X86_32 # define MAX_MPC_ENTRY 1024 -# define MAX_APICS 256 -#else -# if NR_CPUS <= 255 -# define MAX_APICS 255 -# else -# define MAX_APICS 32768 -# endif #endif /* Intel MP Floating Pointer Structure */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7235e5f..17c8090 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -915,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_register_lapic_address(acpi_lapic_addr); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, - acpi_parse_sapic, MAX_APICS); + acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_APICS); + acpi_parse_x2apic, MAX_LOCAL_APIC); 
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_APICS); + acpi_parse_lapic, MAX_LOCAL_APIC); } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 943d814..2fc696e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4109,7 +4109,8 @@ void __init pre_init_apic_IRQ0(void) printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); #endif /* Make sure the irq descriptor is set up */ cfg = alloc_irq_and_cfg_at(0, 0); diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index dd4c281..ca54875 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address) /* All CPUs enumerated by SFI must be present and enabled */ static void __cpuinit mp_sfi_register_lapic(u8 id) { - if (MAX_APICS - id <= 0) { + if (MAX_LOCAL_APIC - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_APICS); + id, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 3371bd0..6320376 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) ver = m->apicver; if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_APICS); + m->apicid, MAX_LOCAL_APIC); return; } -- cgit v0.10.2 From 6bf4123760a5aece6e4829ce90b70b6ffd751d65 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Jan 2011 12:50:16 +1100 Subject: sched: Change wait_for_completion_*_timeout() to return a signed long wait_for_completion_*_timeout() can return: 0: if the wait timed out -ve: if the wait was interrupted +ve: if the completion was completed. As they currently return an 'unsigned long', the last two cases are not easily distinguished which can easily result in buggy code, as is the case for the recently added wait_for_completion_interruptible_timeout() call in net/sunrpc/cache.c So change them both to return 'long'. As MAX_SCHEDULE_TIMEOUT is LONG_MAX, a large +ve return value should never overflow. Signed-off-by: NeilBrown Cc: Peter Zijlstra Cc: J. 
From 6bf4123760a5aece6e4829ce90b70b6ffd751d65 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Jan 2011 12:50:16 +1100 Subject: sched: Change wait_for_completion_*_timeout() to return a signed long wait_for_completion_*_timeout() can return: 0: if the wait timed out -ve: if the wait was interrupted +ve: if the completion was completed. As they currently return an 'unsigned long', the last two cases are not easily distinguished, which can easily result in buggy code, as is the case for the recently added wait_for_completion_interruptible_timeout() call in net/sunrpc/cache.c. So change them both to return 'long'. As MAX_SCHEDULE_TIMEOUT is LONG_MAX, a large +ve return value should never overflow. Signed-off-by: NeilBrown Cc: Peter Zijlstra Cc: J. Bruce Fields Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <20110105125016.64ccab0e@notabene.brown> Signed-off-by: Ingo Molnar
diff --git a/include/linux/completion.h b/include/linux/completion.h index 36d57f74..51494e6 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); -extern unsigned long wait_for_completion_interruptible_timeout( - struct completion *x, unsigned long timeout); -extern unsigned long wait_for_completion_killable_timeout( - struct completion *x, unsigned long timeout); +extern long wait_for_completion_interruptible_timeout( + struct completion *x, unsigned long timeout); +extern long wait_for_completion_killable_timeout( + struct completion *x, unsigned long timeout); extern bool try_wait_for_completion(struct completion *x); extern bool completion_done(struct completion *x);
diff --git a/kernel/sched.c b/kernel/sched.c index f2f914e..114a0de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4395,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) { @@ -4428,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_killable_timeout(struct completion *x, unsigned long timeout) {
-- cgit v0.10.2
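With a signed return type the three outcomes can be separated cleanly at the call site. A minimal sketch (the function name and timeout value are hypothetical):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int example_wait_for_device(struct completion *done)
{
	long ret;

	ret = wait_for_completion_interruptible_timeout(done,
							msecs_to_jiffies(500));
	if (ret == 0)
		return -ETIMEDOUT;	/* timed out */
	if (ret < 0)
		return ret;		/* interrupted, e.g. -ERESTARTSYS */
	return 0;			/* completed; ret was the remaining jiffies */
}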
From a40649781b9c10e192fad9f00a60c6d946da553f Mon Sep 17 00:00:00 2001 From: Michael Prokop Date: Wed, 5 Jan 2011 16:27:15 -0800 Subject: Documentation/dontdiff: add further autogenerated files to ignore list Mainly resulting from (but not limited to) autogenerated files of lib/raid6 and drivers/gpu/drm/radeon. The list was generated by diffing a clean 2.6.36 tree against a built one. Signed-off-by: Michael Prokop Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds
diff --git a/Documentation/dontdiff b/Documentation/dontdiff index d9bcffd..470d3db 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -62,6 +62,10 @@ aic7*reg_print.c* aic7*seq.h* aicasm aicdb.h* +altivec1.c +altivec2.c +altivec4.c +altivec8.c asm-offsets.h asm_offsets.h autoconf.h* @@ -76,6 +80,7 @@ btfixupprep build bvmlinux bzImage* +capflags.c classlist.h* comp*.log compile.h* @@ -94,6 +99,7 @@ devlist.h* docproc elf2ecoff elfconfig.h* +evergreen_reg_safe.h fixdep flask.h fore200e_mkfirm @@ -108,9 +114,16 @@ genksyms *_gray256.c ihex2fw ikconfig.h* +inat-tables.c initramfs_data.cpio initramfs_data.cpio.gz initramfs_list +int16.c +int1.c +int2.c +int32.c +int4.c +int8.c kallsyms kconfig keywords.c @@ -140,6 +153,7 @@ mkprep mktables mktree modpost +modules.builtin modules.order modversions.h* ncscope.* @@ -153,14 +167,23 @@ pca200e.bin pca200e_ecd.bin2 piggy.gz piggyback +piggy.S pnmtologo ppc_defs.h* pss_boot.h qconf +r100_reg_safe.h +r200_reg_safe.h +r300_reg_safe.h +r420_reg_safe.h +r600_reg_safe.h raid6altivec*.c raid6int*.c raid6tables.c relocs +rn50_reg_safe.h +rs600_reg_safe.h +rv515_reg_safe.h series setup setup.bin @@ -169,6 +192,7 @@ sImage sm_tbl* split-include syscalltab.h +tables.c tags tftpboot.img timeconst.h @@ -190,6 +214,7 @@ vmlinux vmlinux-* vmlinux.aout vmlinux.lds +voffset.h vsyscall.lds vsyscall_32.lds wanxlfw.inc @@ -200,3 +225,4 @@ wakeup.elf wakeup.lds zImage* zconf.hash.c +zoffset.h
-- cgit v0.10.2
From d5ba92b7958e3ff2f0878e45b9b42cb6976853dd Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Wed, 5 Jan 2011 16:27:53 -0800 Subject: Documentation: update kernel-docs.txt Fixed typos and removed duplicated entries. Signed-off-by: Nicolas Kaiser Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index 715eaaf..9a86746 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt @@ -537,7 +537,7 @@ Notes: Further information in http://www.oreilly.com/catalog/linuxdrive2/ - * Title: "Linux Device Drivers, 3nd Edition" + * Title: "Linux Device Drivers, 3rd Edition" Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman Publisher: O'Reilly & Associates. Date: 2005. @@ -592,14 +592,6 @@ Pages: 600. ISBN: 0-13-101908-2 - * Title: "The Design and Implementation of the 4.4 BSD UNIX - Operating System" - Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels, - John S. Quarterman. - Publisher: Addison-Wesley. - Date: 1996. - ISBN: 0-201-54979-4 - * Title: "Programming for the real world - POSIX.4" Author: Bill O. Gallmeister. Publisher: O'Reilly & Associates, Inc.. @@ -610,28 +602,13 @@ POSIX. Good reference. * Title: "UNIX Systems for Modern Architectures: Symmetric - Multiprocesssing and Caching for Kernel Programmers" + Multiprocessing and Caching for Kernel Programmers" Author: Curt Schimmel. Publisher: Addison Wesley. Date: June, 1994. Pages: 432. ISBN: 0-201-63338-8 - * Title: "The Design and Implementation of the 4.3 BSD UNIX - Operating System" - Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J. - Karels, John S. Quarterman. - Publisher: Addison-Wesley. - Date: 1989 (reprinted with corrections on October, 1990). - ISBN: 0-201-06196-1 - - * Title: "The Design of the UNIX Operating System" - Author: Maurice J. Bach. - Publisher: Prentice Hall. - Date: 1986. - Pages: 471. - ISBN: 0-13-201757-1 - MISCELLANEOUS: * Name: linux/Documentation
-- cgit v0.10.2
From 8484baaa5065b460e5eb18ee721d8417251f7897 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 5 Jan 2011 16:28:43 -0800 Subject: kernel-doc: code reorganization Move the 'main' code and the subroutines around so that they are not so intermixed, for better readability/understanding (relative to Perl). It was messy to follow the primary flow of code execution with the code being mixed. Now the code begins with data initialization, followed by all subroutines, then ends with the main code execution. This is almost totally source code movement, with a few changes as needed for forward declarations. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds
diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 39580a5..9f85012 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -155,6 +155,8 @@ use strict; # '@parameter' - name of a parameter # '%CONST' - name of a constant. +## init lots of data + my $errors = 0; my $warnings = 0; my $anon_struct_union = 0; @@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1", $type_param, "\$1" ); my $blankline_list = ""; -sub usage { - print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n"; - print " [ -no-doc-sections ]\n"; - print " [ -function funcname [ -function funcname ...] ]\n"; - print " [ -nofunction funcname [ -nofunction funcname ...] ]\n"; - print " c source file(s) > outputfile\n"; - print " -v : verbose output, more warnings & other info listed\n"; - exit 1; -} - # read arguments if ($#ARGV == -1) { usage(); } +my $kernelversion; +my $dohighlight = ""; + my $verbose = 0; my $output_mode = "man"; my $no_doc_sections = 0; @@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June', 'November', 'December')[(localtime)[4]] . " " . ((localtime)[5]+1900); -# Essentially these are globals +# Essentially these are globals. # They probably want to be tidied up, made more localised or something. # CAVEAT EMPTOR! Some of the others I localised may not want to be, which # could cause "use of undefined value" or other bugs. @@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) { } } +# continue execution near EOF; + +sub usage { + print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n"; + print " [ -no-doc-sections ]\n"; + print " [ -function funcname [ -function funcname ...] ]\n"; + print " [ -nofunction funcname [ -nofunction funcname ...] ]\n"; + print " c source file(s) > outputfile\n"; + print " -v : verbose output, more warnings & other info listed\n"; + exit 1; +} + # get kernel version from env sub get_kernel_version() { my $version = 'unknown kernel version'; @@ -362,15 +369,6 @@ sub get_kernel_version() { } return $version; } -my $kernelversion = get_kernel_version(); - -# generate a sequence of code that will splice in highlighting information -# using the s// operator. -my $dohighlight = ""; -foreach my $pattern (keys %highlights) { -# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n"; - $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n"; -} ## # dumps section contents to arrays/hashes intended for that purpose. @@ -1851,34 +1849,6 @@ sub dump_function($$) { }); } -sub process_file($); - -# Read the file that maps relative names to absolute names for -# separate source and object directories and for shadow trees. -if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { - my ($relname, $absname); - while() { - chop(); - ($relname, $absname) = (split())[0..1]; - $relname =~ s:^/+::; - $source_map{$relname} = $absname; - } - close(SOURCE_MAP); -} - -foreach (@ARGV) { - chomp; - process_file($_); -} -if ($verbose && $errors) { - print STDERR "$errors errors\n"; -} -if ($verbose && $warnings) { - print STDERR "$warnings warnings\n"; -} - -exit($errors); - sub reset_state { $function = ""; %constants = (); @@ -2285,3 +2255,39 @@ sub process_file($) { } } } + + +$kernelversion = get_kernel_version(); + +# generate a sequence of code that will splice in highlighting information +# using the s// operator. +foreach my $pattern (keys %highlights) { +# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n"; + $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n"; +} + +# Read the file that maps relative names to absolute names for +# separate source and object directories and for shadow trees. +if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { + my ($relname, $absname); + while() { + chop(); + ($relname, $absname) = (split())[0..1]; + $relname =~ s:^/+::; + $source_map{$relname} = $absname; + } + close(SOURCE_MAP); +} + +foreach (@ARGV) { + chomp; + process_file($_); +} +if ($verbose && $errors) { + print STDERR "$errors errors\n"; +} +if ($verbose && $warnings) { + print STDERR "$warnings warnings\n"; +} + +exit($errors);
-- cgit v0.10.2
From 3a9f987b3141f086de27832514aad9f50a53f754 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Jan 2011 15:40:10 -0500 Subject: tracing: Include module.h in define_trace.h While doing some development, Peter Zijlstra and I found that if a CREATE_TRACE_POINTS include is done before module.h is included, it can break the build. We have been lucky so far that this has not broken the build, since module.h is included in almost everything. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index b0b4eb2..da39b22 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -21,6 +21,16 @@ #undef CREATE_TRACE_POINTS #include +/* + * module.h includes tracepoints, and because ftrace.h + * pulls in module.h: + * trace/ftrace.h -> linux/ftrace_event.h -> linux/perf_event.h -> + * linux/ftrace.h -> linux/module.h + * we must include module.h here before we play with any of + * the TRACE_EVENT() macros, otherwise the tracepoints included + * by module.h may break the build. + */ +#include #undef TRACE_EVENT #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
-- cgit v0.10.2
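The pattern the fix protects is the one translation unit per subsystem that instantiates its tracepoints. A sketch, with a placeholder header name:

/* example_trace.c -- illustrative; trace/events/myevents.h is a placeholder. */
#define CREATE_TRACE_POINTS
#include <trace/events/myevents.h>
/*
 * Before this change, if nothing included linux/module.h earlier in
 * this file, the CREATE_TRACE_POINTS expansion could break the build;
 * define_trace.h now pulls in module.h itself.
 */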
From ec6e7c3ae39cb1dc279b5274aaaadd09ff8e224b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 6 Jan 2011 13:45:32 -0500 Subject: tracing/trivial: Add missing comma in TRACE_EVENT comment Add the missing comma within the TRACE_EVENT() example in tracepoint.h. Signed-off-by: Mathieu Desnoyers LKML-Reference: <20110106184532.GA2526@Krystal> CC: Frederic Weisbecker CC: Ingo Molnar CC: Thomas Gleixner Signed-off-by: Steven Rostedt
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d3e4f87..899103c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -326,7 +326,7 @@ do_trace: \ * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; * __entry->next_prio = next->prio; - * ) + * ), * * * * * Formatted output of a trace record via TP_printk().
-- cgit v0.10.2
From 881fe4bdcdc899674524e30a76c76d298b447008 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 6 Jan 2011 12:53:19 -0500 Subject: tracing: remove duplicate null-pointer check in skb tracepoint The check for a NULL skb in the kfree_skb trace event duplicates the check already done in its only caller, kfree_skb(). Remove this duplicate check. Signed-off-by: Mathieu Desnoyers LKML-Reference: <20110106175319.GA30610@Krystal> Acked-by: Neil Horman Acked-by: David S. Miller CC: Steven Rostedt CC: Frederic Weisbecker CC: Ingo Molnar CC: Thomas Gleixner CC: Zhaolei Signed-off-by: Steven Rostedt
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 75ce9d5..f10293c 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -25,9 +25,7 @@ TRACE_EVENT(kfree_skb, TP_fast_assign( __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } + __entry->protocol = ntohs(skb->protocol); __entry->location = location; ),
-- cgit v0.10.2
From bd1c8b22b7b81c6f6c4f5c19cb2387da3d02fb0f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 4 Jan 2011 16:06:09 +0800 Subject: tracepoint: Add __rcu annotation Add a __rcu annotation to (struct tracepoint)->funcs. Acked-by: Mathieu Desnoyers Signed-off-by: Lai Jiangshan LKML-Reference: <4D22D4F1.50505@cn.fujitsu.com> Signed-off-by: Steven Rostedt
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 899103c..c681461 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -32,7 +32,7 @@ struct tracepoint { int state; /* State. */ void (*regfunc)(void); void (*unregfunc)(void); - struct tracepoint_func *funcs; + struct tracepoint_func __rcu *funcs; } __attribute__((aligned(32))); /* * Aligned on 32 bytes because it is * globally visible and gcc happily
-- cgit v0.10.2
From 1dbd1951f39e13da579ffe879cce19586d0462de Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 9 Dec 2010 15:47:56 +0800 Subject: tracing: Fix preempt count leak While running my ftrace stress test, this showed up: BUG: sleeping function called from invalid context at mm/mmap.c:233 ... note: cat[3293] exited with preempt_count 1 The bug was introduced by commit 91e86e560d0b3ce4c5fc64fd2bbb99f856a30a4e ("tracing: Fix recursive user stack trace") Cc: Signed-off-by: Li Zefan LKML-Reference: <4D0089AC.1020802@cn.fujitsu.com> Signed-off-by: Steven Rostedt
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959..dc53ecb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) __this_cpu_inc(user_stack_count); - - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) - return; + goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; @@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + out_drop_count: __this_cpu_dec(user_stack_count); - out: preempt_enable(); }
-- cgit v0.10.2
From 870915e047a2da695be118d32dd5a900f0c3e933 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 28 Oct 2010 11:31:17 -0400 Subject: tracing: Fix TRACE_EVENT power tracepoint creation DEFINE_TRACE should also exist when CONFIG_EVENT_TRACING=n. Otherwise, setting only CONFIG_TRACEPOINTS=y is broken. Acked-by: Arjan van de Ven Signed-off-by: Mathieu Desnoyers LKML-Reference: <20101028153117.GA4051@Krystal> Signed-off-by: Steven Rostedt
diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff08..e0f2831 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f3381..761c510 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_EVENT_TRACING) += power-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif
-- cgit v0.10.2
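What the Makefile change guarantees is that a bare tracepoint, declared without any TRACE_EVENT() plumbing, still builds and links when CONFIG_TRACEPOINTS=y and CONFIG_EVENT_TRACING=n. A sketch of such a tracepoint (all names are placeholders):

#include <linux/tracepoint.h>

DECLARE_TRACE(example_power_state,
	TP_PROTO(unsigned int state),
	TP_ARGS(state));

DEFINE_TRACE(example_power_state);

static void example_enter_state(unsigned int state)
{
	trace_example_power_state(state);	/* no-op until a probe registers */
}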
From 2d75af2f2a7a6103a6d539a492fe81deacabde44 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 7 Jan 2011 13:36:58 -0500 Subject: dynamic debug: Fix build issue with older gcc On older gcc (3.3) dynamic debug fails to compile: include/net/inet_connection_sock.h: In function `inet_csk_reset_xmit_timer': include/net/inet_connection_sock.h:236: error: duplicate label declaration `do_printk' include/net/inet_connection_sock.h:219: error: this is a previous declaration include/net/inet_connection_sock.h:236: error: duplicate label declaration `out' include/net/inet_connection_sock.h:219: error: this is a previous declaration include/net/inet_connection_sock.h:236: error: duplicate label `do_printk' include/net/inet_connection_sock.h:236: error: duplicate label `out' Fix by reverting the usage of JUMP_LABEL() in dynamic debug for now. Cc: Reported-by: Tetsuo Handa Tested-by: Tetsuo Handa Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index a90b389..1c70028 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, extern int ddebug_remove_module(const char *mod_name); #define dynamic_pr_debug(fmt, ...) do { \ - __label__ do_printk; \ - __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - JUMP_LABEL(&descriptor.enabled, do_printk); \ - goto out; \ -do_printk: \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -out: ; \ + if (unlikely(descriptor.enabled)) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #define dynamic_dev_dbg(dev, fmt, ...) do { \ - __label__ do_printk; \ - __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - JUMP_LABEL(&descriptor.enabled, do_printk); \ - goto out; \ -do_printk: \ - dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ -out: ; \ + if (unlikely(descriptor.enabled)) \ + dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ } while (0) #else
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 3094318..b335acb 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -141,11 +141,10 @@ static void ddebug_change(const struct ddebug_query *query, else if (!dp->flags) dt->num_enabled++; dp->flags = newflags; - if (newflags) { - jump_label_enable(&dp->enabled); - } else { - jump_label_disable(&dp->enabled); - } + if (newflags) + dp->enabled = 1; + else + dp->enabled = 0; if (verbose) printk(KERN_INFO "ddebug: changed %s:%d [%s]%s %s\n",
-- cgit v0.10.2
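For reference, the call-site pattern that results is plain C that even old compilers handle. The sketch below is an illustrative, hand-expanded view of a pr_debug() under CONFIG_DYNAMIC_DEBUG after this change, not verbatim preprocessor output:

static void example_probe(void)
{
	/* pr_debug("loaded\n") now roughly expands to: */
	static struct _ddebug descriptor
		__used __attribute__((section("__verbose"), aligned(8))) = {
		KBUILD_MODNAME, __func__, __FILE__, "loaded\n", __LINE__,
		_DPRINTK_FLAGS_DEFAULT,
	};

	if (unlikely(descriptor.enabled))
		printk(KERN_DEBUG pr_fmt("loaded\n"));
}

The descriptor's enabled flag is flipped at runtime through the standard control file, e.g. echo 'file foo.c +p' > <debugfs>/dynamic_debug/control.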