From 1d93e52eb48df986a3c4d5ad8a520bf1f6837367 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jw@emlix.com>
Date: Wed, 11 Feb 2009 08:47:19 -0700
Subject: dmaengine: update kerneldoc

Some of the kerneldoc comments in the dmaengine header describe
already removed structure members.  Remove them.

Also add a short description for dma_device->device_is_tx_complete.

Signed-off-by: Johannes Weiner <jw@emlix.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3e68469..087e79a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -97,7 +97,6 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t;
 
 /**
  * struct dma_chan_percpu - the per-CPU part of struct dma_chan
- * @refcount: local_t used for open-coded "bigref" counting
  * @memcpy_count: transaction counter
  * @bytes_transferred: byte counter
  */
@@ -114,9 +113,6 @@ struct dma_chan_percpu {
  * @cookie: last cookie value returned to client
  * @chan_id: channel ID for sysfs
  * @dev: class device for sysfs
- * @refcount: kref, used in "bigref" slow-mode
- * @slow_ref: indicates that the DMA channel is free
- * @rcu: the DMA channel's RCU head
  * @device_node: used to add this to the device chan list
  * @local: per-cpu pointer to a struct dma_chan_percpu
  * @client-count: how many clients are using this channel
@@ -211,8 +207,6 @@ struct dma_async_tx_descriptor {
  * @global_node: list_head for global dma_device_list
  * @cap_mask: one or more dma_capability flags
  * @max_xor: maximum number of xor sources, 0 if no capability
- * @refcount: reference count
- * @done: IO completion struct
  * @dev_id: unique device ID
  * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
@@ -225,6 +219,7 @@ struct dma_async_tx_descriptor {
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
  * @device_terminate_all: terminate all pending operations
+ * @device_is_tx_complete: poll for transaction completion
  * @device_issue_pending: push pending transactions to hardware
  */
 struct dma_device {
-- 
cgit v0.10.2


From 973a2dd1d50a11d380086601f14e59116f93e8c5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:32 +0100
Subject: x86, mce: disable machine checks on suspend

Impact: Bug fix

During suspend it is not reliable to process machine check
exceptions, because CPUs disappear but can still get machine check
broadcasts.  Also the system is slightly more likely to
machine check them, but the handler is typically not a position
to handle them in a meaningfull way.

So disable them during suspend and enable them during resume.

Also make sure they are always disabled on hot-unplugged CPUs.

This new code assumes that suspend always hotunplugs all
non BP CPUs.

v2: Remove the WARN_ONs Thomas objected to.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 25cf624..5ed80991 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -728,6 +728,29 @@ __setup("mce=", mcheck_enable);
  * Sysfs support
  */
 
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+	int i;
+
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+	return mce_disable();
+}
+
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
    Only one CPU is active at this time, the others get readded later using
    CPU hotplug. */
@@ -752,6 +775,8 @@ static void mce_restart(void)
 }
 
 static struct sysdev_class mce_sysclass = {
+	.suspend = mce_suspend,
+	.shutdown = mce_shutdown,
 	.resume = mce_resume,
 	.name = "machinecheck",
 };
-- 
cgit v0.10.2


From 123aa76ec0cab5d4881cd8509faed43231e68801 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:27 +0100
Subject: x86, mce: don't disable machine checks during code patching

Impact: low priority bug fix

This removes part of a a patch I added myself some time ago. After some
consideration the patch was a bad idea. In particular it stopped machine check
exceptions during code patching.

To quote the comment:

        * MCEs only happen when something got corrupted and in this
        * case we must do something about the corruption.
        * Ignoring it is worse than a unlikely patching race.
        * Also machine checks tend to be broadcast and if one CPU
        * goes into machine check the others follow quickly, so we don't
        * expect a machine check to cause undue problems during to code
        * patching.

So undo the machine check related parts of
8f4e956b313dcccbc7be6f10808952345e3b638c NMIs are still disabled.

This only removes code, the only additions are a new comment.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17..5522273 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -120,8 +120,6 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
 #else
 #define mcheck_init(c) do { } while (0)
 #endif
-extern void stop_mce(void);
-extern void restart_mce(void);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a84ac7b..5b8394a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
 	   that might execute the to be patched code.
 	   Other CPUs are not running. */
 	stop_nmi();
-#ifdef CONFIG_X86_MCE
-	stop_mce();
-#endif
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than a unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during to code
+	 * patching.
+	 */
 
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
 				(unsigned long)__smp_locks_end);
 
 	restart_nmi();
-#ifdef CONFIG_X86_MCE
-	restart_mce();
-#endif
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce..3552119 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 static int __init mcheck_disable(char *str)
 {
 	mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 5ed80991..25ccdbe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -680,20 +680,6 @@ static struct miscdevice mce_log_device = {
 	&mce_chrdev_ops,
 };
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 /*
  * Old style boot options parsing. Only for compatibility.
  */
-- 
cgit v0.10.2


From 9bd984058088d6ef7af6946591a207e51a2f4890 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:28 +0100
Subject: x86, mce: always use separate work queue to run trigger

Impact: Needed for bug fix in next patch

This relaxes the requirement that mce_notify_user has to run in process
context. Useful for future changes, but also leads to cleaner
behaviour now. Now instead mce_notify_user can be called directly
from interrupt (but not NMI) context.

The work queue only uses a single global work struct, which can be done safely
because it is always free to reuse before the trigger function is executed.
This way no events can be lost.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 25ccdbe..18b379c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -380,11 +380,17 @@ static void mcheck_timer(struct work_struct *work)
 	schedule_delayed_work(&mcheck_work, next_interval);
 }
 
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
 /*
- * This is only called from process context.  This is where we do
- * anything we need to alert userspace about new MCEs.  This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
  */
 int mce_notify_user(void)
 {
@@ -394,9 +400,14 @@ int mce_notify_user(void)
 		unsigned long now = jiffies;
 
 		wake_up_interruptible(&mce_wait);
-		if (trigger[0])
-			call_usermodehelper(trigger, trigger_argv, NULL,
-						UMH_NO_WAIT);
+
+		/*
+		 * There is no risk of missing notifications because
+		 * work_pending is always cleared before the function is
+		 * executed.
+		 */
+		if (trigger[0] && !work_pending(&mce_trigger_work))
+			schedule_work(&mce_trigger_work);
 
 		if (time_after_eq(now, last_print + (check_interval*HZ))) {
 			last_print = now;
-- 
cgit v0.10.2


From 52d168e28bc11dd026b620fe1767cadde5a747cd Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:29 +0100
Subject: x86, mce: switch machine check polling to per CPU timer

Impact: Higher priority bug fix

The machine check poller runs a single timer and then broadcasted an
IPI to all CPUs to check them. This leads to unnecessary
synchronization between CPUs. The original CPU running the timer has
to wait potentially a long time for all other CPUs answering. This is
also real time unfriendly and in general inefficient.

This was especially a problem on systems with a lot of events where
the poller run with a higher frequency after processing some events.
There could be more and more CPU time wasted with this, to
the point of significantly slowing down machines.

The machine check polling is actually fully independent per CPU, so
there's no reason to not just do this all with per CPU timers.  This
patch implements that.

Also switch the poller also to use standard timers instead of work
queues. It was using work queues to be able to execute a user program
on a event, but mce_notify_user() handles this case now with a
separate callback. So instead always run the poll code in in a
standard per CPU timer, which means that in the common case of not
having to execute a trigger there will be less overhead.

This allows to clean up the initialization significantly, because
standard timers are already up when machine checks get init'ed.  No
multiple initialization functions.

Thanks to Thomas Gleixner for some help.

Cc: thockin@google.com
v2: Use del_timer_sync() on cpu shutdown and don't try to handle
migrated timers.
v3: Add WARN_ON for timer running on unexpected CPU

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 18b379c..3f0550d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -353,18 +353,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
+	struct timer_list *t = &per_cpu(mce_timer, data);
+
+	WARN_ON(smp_processor_id() != data);
+
 	if (mce_available(&current_cpu_data))
 		do_machine_check(NULL, 0);
-}
-
-static void mcheck_timer(struct work_struct *work)
-{
-	on_each_cpu(mcheck_check_cpu, NULL, 1);
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -377,7 +376,8 @@ static void mcheck_timer(struct work_struct *work)
 				(int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	schedule_delayed_work(&mcheck_work, next_interval);
+	t->expires = jiffies + next_interval;
+	add_timer(t);
 }
 
 static void mce_do_trigger(struct work_struct *work)
@@ -436,16 +436,11 @@ static struct notifier_block mce_idle_notifier = {
 
 static __init int periodic_mcheck_init(void)
 {
-	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
-	idle_notifier_register(&mce_idle_notifier);
-	return 0;
+       idle_notifier_register(&mce_idle_notifier);
+       return 0;
 }
 __initcall(periodic_mcheck_init);
 
-
 /*
  * Initialize Machine Checks for a CPU.
  */
@@ -515,6 +510,20 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
+static void mce_init_timer(void)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+
+	/* data race harmless because everyone sets to the same value */
+	if (!next_interval)
+		next_interval = check_interval * HZ;
+	if (!next_interval)
+		return;
+	setup_timer(t, mcheck_timer, smp_processor_id());
+	t->expires = round_jiffies_relative(jiffies + next_interval);
+	add_timer(t);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
@@ -529,6 +538,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 
 	mce_init(NULL);
 	mce_cpu_features(c);
+	mce_init_timer();
 }
 
 /*
@@ -758,17 +768,19 @@ static int mce_resume(struct sys_device *dev)
 	return 0;
 }
 
+static void mce_cpu_restart(void *data)
+{
+	del_timer_sync(&__get_cpu_var(mce_timer));
+	if (mce_available(&current_cpu_data))
+		mce_init(NULL);
+	mce_init_timer();
+}
+
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	if (next_interval)
-		cancel_delayed_work(&mcheck_work);
-	/* Timer race is harmless here */
-	on_each_cpu(mce_init, NULL, 1);
 	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
+	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 static struct sysdev_class mce_sysclass = {
@@ -899,6 +911,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
+	struct timer_list *t = &per_cpu(mce_timer, cpu);
 
 	switch (action) {
 	case CPU_ONLINE:
@@ -913,6 +926,15 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 			threshold_cpu_callback(action, cpu);
 		mce_remove_device(cpu);
 		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		del_timer_sync(t);
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		t->expires = round_jiffies_relative(jiffies + next_interval);
+		add_timer_on(t, cpu);
+		break;
 	}
 	return NOTIFY_OK;
 }
-- 
cgit v0.10.2


From 5b4408fdaa62474dd9485cddb9126370d90d4b82 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:30 +0100
Subject: x86, mce: don't set up mce sysdev devices with mce=off

Impact: bug fix, in this case the resume handler shouldn't run which
	avoids incorrectly reenabling machine checks on resume

When MCEs are completely disabled on the command line don't set
up the sysdev devices for them either.

Includes a comment fix from Thomas Gleixner.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 3f0550d..4e2b1bc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -151,6 +151,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 
 static int mce_available(struct cpuinfo_x86 *c)
 {
+	if (mce_dont_init)
+		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
@@ -532,8 +534,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
 	mce_cpu_quirks(c);
 
-	if (mce_dont_init ||
-	    !mce_available(c))
+	if (!mce_available(c))
 		return;
 
 	mce_init(NULL);
@@ -710,8 +711,7 @@ static int __init mcheck_disable(char *str)
 	return 1;
 }
 
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
    mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
    mce=nobootlog Don't log MCEs from before booting. */
-- 
cgit v0.10.2


From d6b75584a3eaab8cb2ab3e8cf90c5e57c1928a85 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:31 +0100
Subject: x86, mce: disable machine checks on offlined CPUs

Impact: Lower priority bug fix

Offlined CPUs could still get machine checks, but the machine check handler
cannot handle them properly, leading to an unconditional crash. Disable
machine checks on CPUs that are going down.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4e2b1bc..1db94c0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -906,6 +906,27 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	cpu_clear(cpu, mce_device_initialized);
 }
 
+/* Make sure there are no machine checks on offlined CPUs. */
+static void __cpuexit mce_disable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void __cpuexit mce_reenable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
@@ -929,11 +950,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		del_timer_sync(t);
+		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		t->expires = round_jiffies_relative(jiffies + next_interval);
 		add_timer_on(t, cpu);
+		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
 		break;
 	}
 	return NOTIFY_OK;
-- 
cgit v0.10.2


From ef41df4344ff952c79746d44a6126bd2cf7ed2bc Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Thu, 12 Feb 2009 13:39:34 +0100
Subject: x86, mce: fix a race condition in mce_read()

Impact: bugfix

Considering the situation as follow:

before: mcelog.next == 1, mcelog.entry[0].finished = 1

+--------------------------------------------------------------------------
R                   W1                  W2                  W3

read mcelog.next (1)
                    mcelog.next++ (2)
                    (working on entry 1,
                    finished == 0)

mcelog.next = 0
                                        mcelog.next++ (1)
                                        (working on entry 0)
                                                           mcelog.next++ (2)
                                                           (working on entry 1)
                        <----------------- race ---------------->
                    (done on entry 1,
                    finished = 1)
                                                           (done on entry 1,
                                                           finished = 1)

To fix the race condition, a cmpxchg loop is added to mce_read() to
ensure no new MCE record can be added between mcelog.next reading and
mcelog.next = 0.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 1db94c0..870d08d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -595,7 +595,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
 	unsigned long *cpu_tsc;
 	static DEFINE_MUTEX(mce_read_mutex);
-	unsigned next;
+	unsigned prev, next;
 	char __user *buf = ubuf;
 	int i, err;
 
@@ -614,25 +614,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	}
 
 	err = 0;
-	for (i = 0; i < next; i++) {
-		unsigned long start = jiffies;
-
-		while (!mcelog.entry[i].finished) {
-			if (time_after_eq(jiffies, start + 2)) {
-				memset(mcelog.entry + i,0, sizeof(struct mce));
-				goto timeout;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+
+			while (!mcelog.entry[i].finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(mcelog.entry + i, 0,
+					       sizeof(struct mce));
+					goto timeout;
+				}
+				cpu_relax();
 			}
-			cpu_relax();
+			smp_rmb();
+			err |= copy_to_user(buf, mcelog.entry + i,
+					    sizeof(struct mce));
+			buf += sizeof(struct mce);
+timeout:
+			;
 		}
-		smp_rmb();
-		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-		buf += sizeof(struct mce);
- timeout:
-		;
-	}
 
-	memset(mcelog.entry, 0, next * sizeof(struct mce));
-	mcelog.next = 0;
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
 
 	synchronize_sched();
 
-- 
cgit v0.10.2


From e35849e910a6543d37c0d13648ef166678d03565 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:43:20 +0100
Subject: x86, mce: enable machine checks in 64-bit defconfig

Impact: Low priority fix

The 32-bit defconfig already had it enabled. And it's a pretty
fundamental feature, so better enable it on 64 bits too.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 322dd27..786ce5a 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -247,7 +247,10 @@ CONFIG_PREEMPT_VOLUNTARY=y
 # CONFIG_PREEMPT is not set
 CONFIG_X86_LOCAL_APIC=y
 CONFIG_X86_IO_APIC=y
-# CONFIG_X86_MCE is not set
+CONFIG_X86_MCE=y
+CONFIG_X86_NEW_MCE=y
+CONFIG_X86_MCE_INTEL=y
+CONFIG_X86_MCE_AMD=y
 # CONFIG_I8K is not set
 CONFIG_MICROCODE=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
-- 
cgit v0.10.2


From 0d7482e3d76522157c9d741d79fce22c401fa0c5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 17 Feb 2009 23:07:13 +0100
Subject: x86, mce: implement dynamic machine check banks support

Impact: cleanup; making code future proof; memory saving on small systems

This patch replaces the hardcoded max number of machine check banks with
dynamic allocation depending on what the CPU reports. The sysfs
data structures and the banks array are dynamically allocated.

There is still a hard bank limit (128) because the mcelog protocol uses
banks >= 128 as pseudo banks to escape other events. But we expect
that 128 banks is beyond any reasonable CPU for now.

This supersedes an earlier patch by Venki, but it solves the problem
more completely by making the limit fully dynamic (up to the 128
boundary).

This saves some memory on machines with less than 6 banks because
they won't need sysdevs for unused ones and also allows to
use sysfs to control these banks on possible future CPUs with
more than 6 banks.

This is an updated patch addressing Venki's comments.  I also added in
another patch from Thomas which fixed the error allocation path (that
patch was previously separated)

Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 870d08d..2297730 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -24,6 +24,8 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +34,12 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
+
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
 
 atomic_t mce_entry;
 
@@ -47,7 +54,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -212,7 +219,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS && !bank[i])
+		if (!bank[i])
 			continue;
 
 		m.misc = 0;
@@ -446,37 +453,54 @@ __initcall(periodic_mcheck_init);
 /*
  * Initialize Machine Checks for a CPU.
  */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
 	u64 cap;
-	int i;
+	unsigned b;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	banks = cap & 0xff;
-	if (banks > MCE_EXTENDED_BANK) {
-		banks = MCE_EXTENDED_BANK;
-		printk(KERN_INFO "MCE: warning: using only %d banks\n",
-		       MCE_EXTENDED_BANK);
+	b = cap & 0xff;
+	if (b > MAX_NR_BANKS) {
+		printk(KERN_WARNING
+		       "MCE: Using only %u machine check banks out of %u\n",
+			MAX_NR_BANKS, b);
+		b = MAX_NR_BANKS;
+	}
+
+	/* Don't support asymmetric configurations today */
+	WARN_ON(banks != 0 && b != banks);
+	banks = b;
+	if (!bank) {
+		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+		if (!bank)
+			return -ENOMEM;
+		memset(bank, 0xff, banks * sizeof(u64));
 	}
+
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
+	return 0;
+}
+
+static void mce_init(void *dummy)
+{
+	u64 cap;
+	int i;
+
 	/* Log the machine checks left over from the previous reset.
 	   This also clears all registers */
 	do_machine_check(NULL, mce_bootlog ? -1 : -2);
 
 	set_in_cr4(X86_CR4_MCE);
 
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS)
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-		else
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
@@ -486,10 +510,10 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if(c->x86 == 15)
+		if (c->x86 == 15 && banks > 4)
 			/* disable GART TBL walk error reporting, which trips off
 			   incorrectly with the IOMMU & 3ware & Cerberus. */
-			clear_bit(10, &bank[4]);
+			clear_bit(10, (unsigned long *)&bank[4]);
 		if(c->x86 <= 17 && mce_bootlog < 0)
 			/* Lots of broken BIOS around that don't clear them
 			   by default and leave crap in there. Don't log. */
@@ -532,11 +556,15 @@ static void mce_init_timer(void)
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-	mce_cpu_quirks(c);
-
 	if (!mce_available(c))
 		return;
 
+	if (mce_cap_init() < 0) {
+		mce_dont_init = 1;
+		return;
+	}
+	mce_cpu_quirks(c);
+
 	mce_init(NULL);
 	mce_cpu_features(c);
 	mce_init_timer();
@@ -819,16 +847,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			 char *buf)
+{
+	u64 b = bank[attr - bank_attrs];
+	return sprintf(buf, "%Lx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			const char *buf, size_t siz)
+{
+	char *end;
+	u64 new = simple_strtoull(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	bank[attr - bank_attrs] = new;
+	mce_restart();
+	return end-buf;
+}
 
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 				char *buf)
@@ -855,8 +893,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
@@ -886,11 +922,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 		if (err)
 			goto error;
 	}
+	for (i = 0; i < banks; i++) {
+		err = sysdev_create_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+		if (err)
+			goto error2;
+	}
 	cpu_set(cpu, mce_device_initialized);
 
 	return 0;
+error2:
+	while (--i >= 0) {
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+	}
 error:
-	while (i--) {
+	while (--i >= 0) {
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 				   mce_attributes[i]);
 	}
@@ -909,6 +956,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	for (i = 0; mce_attributes[i]; i++)
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 			mce_attributes[i]);
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+			&bank_attrs[i]);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 	cpu_clear(cpu, mce_device_initialized);
 }
@@ -973,6 +1023,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 	.notifier_call = mce_cpu_callback,
 };
 
+static __init int mce_init_banks(void)
+{
+	int i;
+
+	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+				GFP_KERNEL);
+	if (!bank_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < banks; i++) {
+		struct sysdev_attribute *a = &bank_attrs[i];
+		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+		if (!a->attr.name)
+			goto nomem;
+		a->attr.mode = 0644;
+		a->show = show_bank;
+		a->store = set_bank;
+	}
+	return 0;
+
+nomem:
+	while (--i >= 0)
+		kfree(bank_attrs[i].attr.name);
+	kfree(bank_attrs);
+	bank_attrs = NULL;
+	return -ENOMEM;
+}
+
 static __init int mce_init_device(void)
 {
 	int err;
@@ -980,6 +1058,11 @@ static __init int mce_init_device(void)
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
+
+	err = mce_init_banks();
+	if (err)
+		return err;
+
 	err = sysdev_class_register(&mce_sysclass);
 	if (err)
 		return err;
-- 
cgit v0.10.2


From b5f2fa4ea00a179ac1c2ff342ceeee261dd75e53 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:43:22 +0100
Subject: x86, mce: factor out duplicated struct mce setup into one function

Impact: cleanup

This merely factors out duplicated code to set up
the initial struct mce state into a single function.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 5522273..048b71d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -90,6 +90,7 @@ extern int mce_disabled;
 
 #include <asm/atomic.h>
 
+void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
 DECLARE_PER_CPU(struct sys_device, device_mce);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -106,7 +107,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 2297730..fed8757 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -65,6 +65,14 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+	memset(m, 0, sizeof(struct mce));
+	m->cpu = smp_processor_id();
+	rdtscll(m->tsc);
+}
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -208,8 +216,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	    || !banks)
 		goto out2;
 
-	memset(&m, 0, sizeof(struct mce));
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
+
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -225,7 +233,6 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 		if ((m.status & MCI_STATUS_VAL) == 0)
@@ -252,8 +259,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code >= 0)
-			rdtscll(m.tsc);
+		if (error_code < 0)
+			m.tsc = 0;
 		if (error_code != -2)
 			mce_log(&m);
 
@@ -341,15 +348,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  * and historically has been the register value of the
  * MSR_IA32_THERMAL_STATUS (Intel) msr.
  */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
 	struct mce m;
 
-	memset(&m, 0, sizeof(m));
-	m.cpu = cpu;
+	mce_setup(&m);
 	m.bank = MCE_THERMAL_BANK;
 	m.status = status;
-	rdtscll(m.tsc);
 	mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4f..75d9dd2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void)
 	exit_idle();
 	irq_enter();
 
-	memset(&m, 0, sizeof(m));
-	rdtscll(m.tsc);
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f25..7f7f101 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 	if (therm_throt_process(msr_val & 1))
-		mce_log_therm_throt_event(smp_processor_id(), msr_val);
+		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
 	irq_exit();
-- 
cgit v0.10.2


From b79109c3bbcf52cac5103979b283b9e5df4e796c Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:43:23 +0100
Subject: x86, mce: separate correct machine check poller and fatal exception
 handler

Impact: cleanup, performance enhancement

The machine check poller is diverging more and more from the fatal
exception handler. Instead of adding more special cases separate the code
paths completely. The corrected poll path is actually quite simple,
and this doesn't result in much code duplication.

This makes both handlers much easier to read and results in
cleaner code flow.  The exception handler now only needs to care
about uncorrected errors, which also simplifies the handling of multiple
errors. The corrected poller also now always runs in standard interrupt
context and does not need to do anything special to handle NMI context.

Minor behaviour changes:
- MCG status is now not cleared on polling.
- Only the banks which had corrected errors get cleared on polling
- The exception handler only clears banks with errors now

v2: Forward port to new patch order. Add "uc" argument.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 048b71d..225cdb5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -112,6 +112,13 @@ void mce_log_therm_throt_event(__u64 status);
 extern atomic_t mce_entry;
 
 extern void do_machine_check(struct pt_regs *, long);
+
+enum mcp_flags {
+	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
+	MCP_UC = (1 << 1),		/* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags);
+
 extern int mce_notify_user(void);
 
 #endif /* !CONFIG_X86_32 */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fed8757..268b05e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags)
+{
+	struct mce m;
+	int i;
+
+	mce_setup(&m);
+
+	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	for (i = 0; i < banks; i++) {
+		if (!bank[i])
+			continue;
+
+		m.misc = 0;
+		m.addr = 0;
+		m.bank = i;
+		m.tsc = 0;
+
+		barrier();
+		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		if (!(m.status & MCI_STATUS_VAL))
+			continue;
+
+		/*
+		 * Uncorrected events are handled by the exception handler
+		 * when it is enabled. But when the exception is disabled log
+		 * everything.
+		 *
+		 * TBD do the same check for MCI_STATUS_EN here?
+		 */
+		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+			continue;
+
+		if (m.status & MCI_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+		if (m.status & MCI_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+		if (!(flags & MCP_TIMESTAMP))
+			m.tsc = 0;
+		/*
+		 * Don't get the IP here because it's unlikely to
+		 * have anything to do with the actual error location.
+		 */
+
+		mce_log(&m);
+		add_taint(TAINT_MACHINE_CHECK);
+
+		/*
+		 * Clear state for this bank.
+		 */
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+
+	/*
+	 * Don't clear MCG_STATUS here because it's only defined for
+	 * exceptions.
+	 */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
  */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
@@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	 * error.
 	 */
 	int kill_it = 0;
+	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 
 	atomic_inc(&mce_entry);
 
-	if ((regs
-	     && notify_die(DIE_NMI, "machine check", regs, error_code,
+	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-	    || !banks)
+		goto out2;
+	if (!banks)
 		goto out2;
 
 	mce_setup(&m);
@@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
+		__clear_bit(i, toclear);
 		if (!bank[i])
 			continue;
 
@@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 		if ((m.status & MCI_STATUS_VAL) == 0)
 			continue;
 
+		/*
+		 * Non uncorrected errors are handled by machine_check_poll
+		 * Leave them alone.
+		 */
+		if ((m.status & MCI_STATUS_UC) == 0)
+			continue;
+
+		/*
+		 * Set taint even when machine check was not enabled.
+		 */
+		add_taint(TAINT_MACHINE_CHECK);
+
+		__set_bit(i, toclear);
+
 		if (m.status & MCI_STATUS_EN) {
 			/* if PCC was set, there's no way out */
 			no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 					no_way_out = 1;
 				kill_it = 1;
 			}
+		} else {
+			/*
+			 * Machine check event was not enabled. Clear, but
+			 * ignore.
+			 */
+			continue;
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code < 0)
-			m.tsc = 0;
-		if (error_code != -2)
-			mce_log(&m);
+		mce_log(&m);
 
 		/* Did this bank cause the exception? */
 		/* Assume that the bank with uncorrectable errors did it,
@@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			panicm = m;
 			panicm_found = 1;
 		}
-
-		add_taint(TAINT_MACHINE_CHECK);
 	}
 
-	/* Never do anything final in the polling timer */
-	if (!regs)
-		goto out;
-
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
 	if (!panicm_found)
@@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
- out:
 	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
@@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data)
 	WARN_ON(smp_processor_id() != data);
 
 	if (mce_available(&current_cpu_data))
-		do_machine_check(NULL, 0);
+		machine_check_poll(MCP_TIMESTAMP);
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -494,9 +580,10 @@ static void mce_init(void *dummy)
 	u64 cap;
 	int i;
 
-	/* Log the machine checks left over from the previous reset.
-	   This also clears all registers */
-	do_machine_check(NULL, mce_bootlog ? -1 : -2);
+	/*
+	 * Log the machine checks left over from the previous reset.
+	 */
+	machine_check_poll(MCP_UC);
 
 	set_in_cr4(X86_CR4_MCE);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 75d9dd2..0069c65 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			do_machine_check(NULL, 0);
+			machine_check_poll(MCP_TIMESTAMP);
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
-- 
cgit v0.10.2


From f6d1826dfad0d15fd14a455facc80b91f2ee642f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 19 Feb 2009 15:44:58 -0800
Subject: x86, mce: use %ll instead of %L for 64-bit numbers

Impact: Cleanup

The standard spelling of a printf pattern for long long is "ll", not
"L", which is for long double.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 268b05e..60a114c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -136,11 +136,11 @@ static void print_mce(struct mce *m)
 			print_symbol("{%s}", m->ip);
 		printk("\n");
 	}
-	printk(KERN_EMERG "TSC %Lx ", m->tsc);
+	printk(KERN_EMERG "TSC %llx ", m->tsc);
 	if (m->addr)
-		printk("ADDR %Lx ", m->addr);
+		printk("ADDR %llx ", m->addr);
 	if (m->misc)
-		printk("MISC %Lx ", m->misc);
+		printk("MISC %llx ", m->misc);
 	printk("\n");
 	printk(KERN_EMERG "This is not a software problem!\n");
 	printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -945,7 +945,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
 			 char *buf)
 {
 	u64 b = bank[attr - bank_attrs];
-	return sprintf(buf, "%Lx\n", b);
+	return sprintf(buf, "%llx\n", b);
 }
 
 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
-- 
cgit v0.10.2


From 42f8faecf7a88371de0f30aebb052d1ae51762c0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 17 Feb 2009 11:46:42 +0800
Subject: x86: use percpu data for 4k hardirq and softirq stacks

Impact: economize memory for large NR_CPUS

percpu data is setup earlier than irq, we can use percpu data
to economize memory.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index e0f29be..90cd94a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
+#include <linux/percpu.h>
 
 #include <asm/apic.h>
 
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
 union irq_ctx {
 	struct thread_info      tinfo;
 	u32                     stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
 
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
 
 static void call_on_stack(void *func, void *stack)
 {
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 	u32 *isp, arg1, arg2;
 
 	curctx = (union irq_ctx *) current_thread_info();
-	irqctx = hardirq_ctx[smp_processor_id()];
+	irqctx = __get_cpu_var(hardirq_ctx);
 
 	/*
 	 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
 {
 	union irq_ctx *irqctx;
 
-	if (hardirq_ctx[cpu])
+	if (per_cpu(hardirq_ctx, cpu))
 		return;
 
-	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(hardirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	hardirq_ctx[cpu] = irqctx;
+	per_cpu(hardirq_ctx, cpu) = irqctx;
 
-	irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(softirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= 0;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	softirq_ctx[cpu] = irqctx;
+	per_cpu(softirq_ctx, cpu) = irqctx;
 
 	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
-	       cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+	       cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 
 void irq_ctx_exit(int cpu)
 {
-	hardirq_ctx[cpu] = NULL;
+	per_cpu(hardirq_ctx, cpu) = NULL;
 }
 
 asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
 
 	if (local_softirq_pending()) {
 		curctx = current_thread_info();
-		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx = __get_cpu_var(softirq_ctx);
 		irqctx->tinfo.task = curctx->task;
 		irqctx->tinfo.previous_esp = current_stack_pointer;
 
-- 
cgit v0.10.2


From 734269521e320ad14ed39ae9b64d482b9028dcd2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:07 +0900
Subject: vmalloc: call flush_cache_vunmap() from unmap_kernel_range()

Impact: proper vcache flush on unmap_kernel_range()

flush_cache_vunmap() should be called before pages are unmapped.  Add
a call to it in unmap_kernel_range().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 75f49d3..c37924a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1012,6 +1012,8 @@ void __init vmalloc_init(void)
 void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
 	unsigned long end = addr + size;
+
+	flush_cache_vunmap(addr, end);
 	vunmap_page_range(addr, end);
 	flush_tlb_kernel_range(addr, end);
 }
-- 
cgit v0.10.2


From 6b588c18f8dacfa6d7957c33c5ff832096e752d3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:07 +0900
Subject: module: reorder module pcpu related functions

Impact: cleanup

Move percpu_modinit() upwards.  This is to ease further changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/kernel/module.c b/kernel/module.c
index ba22484..52b3497 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -480,21 +480,6 @@ static void percpu_modfree(void *freeme)
 	}
 }
 
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
-				 Elf_Shdr *sechdrs,
-				 const char *secstrings)
-{
-	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
-}
-
-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
-}
-
 static int percpu_modinit(void)
 {
 	pcpu_num_used = 2;
@@ -513,7 +498,24 @@ static int percpu_modinit(void)
 	return 0;
 }
 __initcall(percpu_modinit);
+
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+				 Elf_Shdr *sechdrs,
+				 const char *secstrings)
+{
+	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+}
+
 #else /* ... !CONFIG_SMP */
+
 static inline void *percpu_modalloc(unsigned long size, unsigned long align,
 				    const char *name)
 {
@@ -535,6 +537,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 	/* pcpusec should be 0, and size of that section should be 0. */
 	BUG_ON(size != 0);
 }
+
 #endif /* CONFIG_SMP */
 
 #define MODINFO_ATTR(field)	\
-- 
cgit v0.10.2


From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr

Impact: cleanup

There are two allocated per-cpu accessor macros with almost identical
spelling.  The original and far more popular is per_cpu_ptr (44
files), so change over the other 4 files.

tj: kill percpu_ptr() and update UP too

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: mingo@redhat.com
Cc: lenb@kernel.org
Cc: cpufreq@vger.kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319..22590cf 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (!data)
 		return -ENOMEM;
 
-	data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
 	per_cpu(drv_data, cpu) = data;
 
 	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 9cc769b..68fd3d2 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
 			continue;
 		}
 
-		if (!performance || !percpu_ptr(performance, i)) {
+		if (!performance || !per_cpu_ptr(performance, i)) {
 			retval = -EINVAL;
 			continue;
 		}
 
-		pr->performance = percpu_ptr(performance, i);
+		pr->performance = per_cpu_ptr(performance, i);
 		cpumask_set_cpu(i, pr->performance->shared_cpu_map);
 		if (acpi_processor_get_psd(pr)) {
 			retval = -EINVAL;
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 3577ffd..c80cfe1 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -81,23 +81,13 @@ struct percpu_data {
 };
 
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
-/* 
- * Use this to get to a cpu's version of the per-cpu object dynamically
- * allocated. Non-atomic access to the current CPU's version should
- * probably be combined with get_cpu()/put_cpu().
- */ 
-#define percpu_ptr(ptr, cpu)                              \
-({                                                        \
-        struct percpu_data *__p = __percpu_disguise(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];	          \
-})
 
 extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
 extern void percpu_free(void *__pdata);
 
 #else /* CONFIG_SMP */
 
-#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
 static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
 {
@@ -122,6 +112,15 @@ static inline void percpu_free(void *__pdata)
 						  cpu_possible_map)
 #define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type))
 #define free_percpu(ptr)	percpu_free((ptr))
-#define per_cpu_ptr(ptr, cpu)	percpu_ptr((ptr), (cpu))
+/*
+ * Use this to get to a cpu's version of the per-cpu object dynamically
+ * allocated. Non-atomic access to the current CPU's version should
+ * probably be combined with get_cpu()/put_cpu().
+ */
+#define per_cpu_ptr(ptr, cpu)						\
+({									\
+        struct percpu_data *__p = __percpu_disguise(ptr);		\
+        (__typeof__(ptr))__p->ptrs[(cpu)];				\
+})
 
 #endif /* __LINUX_PERCPU_H */
diff --git a/kernel/sched.c b/kernel/sched.c
index fc17fd9..9d30ac9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 	u64 data;
 
 #ifndef CONFIG_64BIT
@@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 
 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 
 #ifndef CONFIG_64BIT
 	/*
@@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	ca = task_ca(tsk);
 
 	for (; ca; ca = ca->parent) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0cd415e..74541ca 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 	 * doesn't hit this CPU until we're ready. */
 	get_cpu();
 	for_each_online_cpu(i) {
-		sm_work = percpu_ptr(stop_machine_work, i);
+		sm_work = per_cpu_ptr(stop_machine_work, i);
 		INIT_WORK(sm_work, stop_cpu);
 		queue_work_on(i, stop_machine_wq, sm_work);
 	}
-- 
cgit v0.10.2


From 313e458f81ec3852106c5a83830fe0d4f405a71a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: alloc_percpu: add align argument to __alloc_percpu.

This prepares for a real __alloc_percpu, by adding an alignment argument.
Only one place uses __alloc_percpu directly, and that's for a string.

tj: af_inet also uses __alloc_percpu(), update it.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/block/blktrace.c b/block/blktrace.c
index 39cc3bf..4877662 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->sequence)
 		goto err;
 
-	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
 	if (!bt->msg_data)
 		goto err;
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index c80cfe1..1fdaee9 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -108,9 +108,10 @@ static inline void percpu_free(void *__pdata)
 
 /* (legacy) interface for use without CPU hotplug handling */
 
-#define __alloc_percpu(size)	percpu_alloc_mask((size), GFP_KERNEL, \
+#define __alloc_percpu(size, align)	percpu_alloc_mask((size), GFP_KERNEL, \
 						  cpu_possible_map)
-#define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type))
+#define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type), \
+						       __alignof__(type))
 #define free_percpu(ptr)	percpu_free((ptr))
 /*
  * Use this to get to a cpu's version of the per-cpu object dynamically
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 743f554..3a3dad8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
 int snmp_mib_init(void *ptr[2], size_t mibsize)
 {
 	BUG_ON(ptr == NULL);
-	ptr[0] = __alloc_percpu(mibsize);
+	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
 	if (!ptr[0])
 		goto err0;
-	ptr[1] = __alloc_percpu(mibsize);
+	ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
 	if (!ptr[1])
 		goto err1;
 	return 0;
-- 
cgit v0.10.2


From f2a8205c4ef1af917d175c36a4097ae5587791c8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: percpu: kill percpu_alloc() and friends

Impact: kill unused functions

percpu_alloc() and its friends never saw much action.  It was supposed
to replace the cpu-mask unaware __alloc_percpu() but it never happened
and in fact __percpu_alloc_mask() itself never really grew proper
up/down handling interface either (no exported interface for
populate/depopulate).

percpu allocation is about to go through major reimplementation and
there's no reason to carry this unused interface around.  Replace it
with __alloc_percpu() and free_percpu().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 1fdaee9..d99e24a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -82,46 +82,43 @@ struct percpu_data {
 
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
 
-extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
-extern void percpu_free(void *__pdata);
+/*
+ * Use this to get to a cpu's version of the per-cpu object
+ * dynamically allocated. Non-atomic access to the current CPU's
+ * version should probably be combined with get_cpu()/put_cpu().
+ */
+#define per_cpu_ptr(ptr, cpu)						\
+({									\
+        struct percpu_data *__p = __percpu_disguise(ptr);		\
+        (__typeof__(ptr))__p->ptrs[(cpu)];				\
+})
+
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void free_percpu(void *__pdata);
 
 #else /* CONFIG_SMP */
 
 #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
-static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+static inline void *__alloc_percpu(size_t size, size_t align)
 {
+	/*
+	 * Can't easily make larger alignment work with kmalloc.  WARN
+	 * on it.  Larger alignment should only be used for module
+	 * percpu sections on SMP for which this path isn't used.
+	 */
+	WARN_ON_ONCE(align > __alignof__(unsigned long long));
 	return kzalloc(size, gfp);
 }
 
-static inline void percpu_free(void *__pdata)
+static inline void free_percpu(void *p)
 {
-	kfree(__pdata);
+	kfree(p);
 }
 
 #endif /* CONFIG_SMP */
 
-#define percpu_alloc_mask(size, gfp, mask) \
-	__percpu_alloc_mask((size), (gfp), &(mask))
-
-#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
-
-/* (legacy) interface for use without CPU hotplug handling */
-
-#define __alloc_percpu(size, align)	percpu_alloc_mask((size), GFP_KERNEL, \
-						  cpu_possible_map)
 #define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type), \
 						       __alignof__(type))
-#define free_percpu(ptr)	percpu_free((ptr))
-/*
- * Use this to get to a cpu's version of the per-cpu object dynamically
- * allocated. Non-atomic access to the current CPU's version should
- * probably be combined with get_cpu()/put_cpu().
- */
-#define per_cpu_ptr(ptr, cpu)						\
-({									\
-        struct percpu_data *__p = __percpu_disguise(ptr);		\
-        (__typeof__(ptr))__p->ptrs[(cpu)];				\
-})
 
 #endif /* __LINUX_PERCPU_H */
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 4297bc4..3653c57 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
- * percpu_alloc_mask - initial setup of per-cpu data
+ * alloc_percpu - initial setup of per-cpu data
  * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-data for cpu's selected through mask bits
+ * @align: alignment
  *
- * Populating per-cpu data for all online cpu's would be a typical use case,
- * which is simplified by the percpu_alloc() wrapper.
- * Per-cpu objects are populated with zeroed buffers.
+ * Allocate dynamic percpu area.  Percpu objects are populated with
+ * zeroed buffers.
  */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+void *__alloc_percpu(size_t size, size_t align)
 {
 	/*
 	 * We allocate whole cache lines to avoid false sharing
 	 */
 	size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
-	void *pdata = kzalloc(sz, gfp);
+	void *pdata = kzalloc(sz, GFP_KERNEL);
 	void *__pdata = __percpu_disguise(pdata);
 
+	/*
+	 * Can't easily make larger alignment work with kmalloc.  WARN
+	 * on it.  Larger alignment should only be used for module
+	 * percpu sections on SMP for which this path isn't used.
+	 */
+	WARN_ON_ONCE(align > __alignof__(unsigned long long));
+
 	if (unlikely(!pdata))
 		return NULL;
-	if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+	if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
+					   &cpu_possible_map)))
 		return __pdata;
 	kfree(pdata);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+EXPORT_SYMBOL_GPL(__alloc_percpu);
 
 /**
- * percpu_free - final cleanup of per-cpu data
+ * free_percpu - final cleanup of per-cpu data
  * @__pdata: object to clean up
  *
  * We simply clean up any per-cpu object left. No need for the client to
  * track and specify through a bis mask which per-cpu objects are to free.
  */
-void percpu_free(void *__pdata)
+void free_percpu(void *__pdata)
 {
 	if (unlikely(!__pdata))
 		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
-EXPORT_SYMBOL_GPL(percpu_free);
+EXPORT_SYMBOL_GPL(free_percpu);
-- 
cgit v0.10.2


From f0aa6617903648077dffe5cfcf7c4458f4610fa7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: vmalloc: implement vm_area_register_early()

Impact: allow multiple early vm areas

There are places where kernel VM area needs to be allocated before
vmalloc is initialized.  This is done by allocating static vm_struct,
initializing several fields and linking it to vmlist and later vmalloc
initialization picking up these from vmlist.  This is currently done
manually and if there's more than one such areas, there's no defined
way to arbitrate who gets which address.

This patch implements vm_area_register_early(), which takes vm_area
struct with flags and size initialized, assigns address to it and puts
it on the vmlist.  This way, multiple early vm areas can determine
which addresses they should use.  The only current user - alpha mm
init - is converted to use it.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 5d7a16e..df6df02 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
 
 	if (alpha_using_srm) {
 		static struct vm_struct console_remap_vm;
-		unsigned long vaddr = VMALLOC_START;
+		unsigned long nr_pages = 0;
+		unsigned long vaddr;
 		unsigned long i, j;
 
+		/* calculate needed size */
+		for (i = 0; i < crb->map_entries; ++i)
+			nr_pages += crb->map[i].count;
+
+		/* register the vm area */
+		console_remap_vm.flags = VM_ALLOC;
+		console_remap_vm.size = nr_pages << PAGE_SHIFT;
+		vm_area_register_early(&console_remap_vm);
+
+		vaddr = (unsigned long)consle_remap_vm.addr;
+
 		/* Set up the third level PTEs and update the virtual
 		   addresses of the CRB entries.  */
 		for (i = 0; i < crb->map_entries; ++i) {
@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
 				vaddr += PAGE_SIZE;
 			}
 		}
-
-		/* Let vmalloc know that we've allocated some space.  */
-		console_remap_vm.flags = VM_ALLOC;
-		console_remap_vm.addr = (void *) VMALLOC_START;
-		console_remap_vm.size = vaddr - VMALLOC_START;
-		vmlist = &console_remap_vm;
 	}
 
 	callback_init_done = 1;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 506e762..bbc0513 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -106,5 +106,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
  */
 extern rwlock_t vmlist_lock;
 extern struct vm_struct *vmlist;
+extern __init void vm_area_register_early(struct vm_struct *vm);
 
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c37924a..d206261 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
+#include <linux/pfn.h>
 
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -982,6 +983,29 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
 }
 EXPORT_SYMBOL(vm_map_ram);
 
+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @size: size of area to register
+ *
+ * This function is used to register kernel vm area before
+ * vmalloc_init() is called.  @vm->size and @vm->flags should contain
+ * proper values on entry and other fields should be zero.  On return,
+ * vm->addr contains the allocated address.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm)
+{
+	static size_t vm_init_off __initdata;
+
+	vm->addr = (void *)VMALLOC_START + vm_init_off;
+	vm_init_off = PFN_ALIGN(vm_init_off + vm->size);
+
+	vm->next = vmlist;
+	vmlist = vm;
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
-- 
cgit v0.10.2


From 8fc48985006da4ceba24508db64ec77fc0dfe3bb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: vmalloc: add un/map_kernel_range_noflush()

Impact: two more public map/unmap functions

Implement map_kernel_range_noflush() and unmap_kernel_range_noflush().
These functions respectively map and unmap address range in kernel VM
area but doesn't do any vcache or tlb flushing.  These will be used by
new percpu allocator.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index bbc0513..599ba79 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -91,6 +91,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
 
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
 			struct page ***pages);
+extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
+				    pgprot_t prot, struct page **pages);
+extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 
 /* Allocate/destroy a 'vmalloc' VM area. */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d206261..224eca9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -153,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
  *
  * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
  */
-static int vmap_page_range(unsigned long start, unsigned long end,
-				pgprot_t prot, struct page **pages)
+static int vmap_page_range_noflush(unsigned long start, unsigned long end,
+				   pgprot_t prot, struct page **pages)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -170,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	flush_cache_vmap(start, end);
 
 	if (unlikely(err))
 		return err;
 	return nr;
 }
 
+static int vmap_page_range(unsigned long start, unsigned long end,
+			   pgprot_t prot, struct page **pages)
+{
+	int ret;
+
+	ret = vmap_page_range_noflush(start, end, prot, pages);
+	flush_cache_vmap(start, end);
+	return ret;
+}
+
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@@ -1033,6 +1042,58 @@ void __init vmalloc_init(void)
 	vmap_initialized = true;
 }
 
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is
+ * responsible for calling flush_cache_vmap() on to-be-mapped areas
+ * before calling this function.
+ *
+ * RETURNS:
+ * The number of pages mapped on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+			     pgprot_t prot, struct page **pages)
+{
+	return vmap_page_range_noflush(addr, addr + size, prot, pages);
+}
+
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is
+ * responsible for calling flush_cache_vunmap() on to-be-mapped areas
+ * before calling this function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
+{
+	vunmap_page_range(addr, addr + size);
+}
+
+/**
+ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Similar to unmap_kernel_range_noflush() but flushes vcache before
+ * the unmapping and tlb after.
+ */
 void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
 	unsigned long end = addr + size;
-- 
cgit v0.10.2


From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: percpu: implement new dynamic percpu allocator

Impact: new scalable dynamic percpu allocator which allows dynamic
        percpu areas to be accessed the same way as static ones

Implement scalable dynamic percpu allocator which can be used for both
static and dynamic percpu areas.  This will allow static and dynamic
areas to share faster direct access methods.  This feature is optional
and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by
arch.  Please read comment on top of mm/percpu.c for details.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index d99e24a..1808099 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -76,23 +76,37 @@
 
 #ifdef CONFIG_SMP
 
-struct percpu_data {
-	void *ptrs[1];
-};
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 
-#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+extern void *pcpu_base_addr;
 
+typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
+
+extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
+				       struct page **pages, size_t cpu_size);
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
  * version should probably be combined with get_cpu()/put_cpu().
  */
+#define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+
+#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+struct percpu_data {
+	void *ptrs[1];
+};
+
+#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+
 #define per_cpu_ptr(ptr, cpu)						\
 ({									\
         struct percpu_data *__p = __percpu_disguise(ptr);		\
         (__typeof__(ptr))__p->ptrs[(cpu)];				\
 })
 
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 extern void *__alloc_percpu(size_t size, size_t align);
 extern void free_percpu(void *__pdata);
 
diff --git a/kernel/module.c b/kernel/module.c
index 52b3497..1f0657a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/percpu.h>
 
 #if 0
 #define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
 }
 
 #ifdef CONFIG_SMP
+
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+			     const char *name)
+{
+	void *ptr;
+
+	if (align > PAGE_SIZE) {
+		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+		       name, align, PAGE_SIZE);
+		align = PAGE_SIZE;
+	}
+
+	ptr = __alloc_percpu(size, align);
+	if (!ptr)
+		printk(KERN_WARNING
+		       "Could not allocate %lu bytes percpu data\n", size);
+	return ptr;
+}
+
+static void percpu_modfree(void *freeme)
+{
+	free_percpu(freeme);
+}
+
+#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 /* Number of blocks used and allocated. */
 static unsigned int pcpu_num_used, pcpu_num_allocated;
 /* Size of each block.  -ve means used. */
@@ -499,6 +528,8 @@ static int percpu_modinit(void)
 }
 __initcall(percpu_modinit);
 
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 				 Elf_Shdr *sechdrs,
 				 const char *secstrings)
diff --git a/mm/Makefile b/mm/Makefile
index 72255be..818569b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
+ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+obj-$(CONFIG_SMP) += percpu.o
+else
 obj-$(CONFIG_SMP) += allocpercpu.o
+endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 0000000..4617d97
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,890 @@
+/*
+ * linux/mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009		SUSE Linux Products GmbH
+ * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
+ * chunk is consisted of num_possible_cpus() units and the first chunk
+ * is used for static percpu variables in the kernel image (special
+ * boot time alloc/init handling necessary as these areas need to be
+ * brought up before allocation services are running).  Unit grows as
+ * necessary and all units grow or shrink in unison.  When a chunk is
+ * filled up, another chunk is allocated.  ie. in vmalloc area
+ *
+ *  c0                           c1                         c2
+ *  -------------------          -------------------        ------------
+ * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
+ *  -------------------  ......  -------------------  ....  ------------
+ *
+ * Allocation is done in offset-size areas of single unit space.  Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
+ * percpu base registers UNIT_SIZE apart.
+ *
+ * There are usually many small percpu allocations many of them as
+ * small as 4 bytes.  The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be eqaul to or larger than the maximum contiguous
+ * area in the chunk.  This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map.  A positive value in the map represents a free
+ * region and negative allocated.  Allocation inside a chunk is done
+ * by scanning this map sequentially and serving the first matching
+ * entry.  This is mostly copied from the percpu_modalloc() allocator.
+ * Chunks are also linked into a rb tree to ease address to chunk
+ * mapping during free.
+ *
+ * To use this allocator, arch code should do the followings.
+ *
+ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ *   regular address to percpu pointer and back
+ *
+ * - use pcpu_setup_static() during percpu area initialization to
+ *   setup kernel static percpu area
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bootmem.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#define PCPU_MIN_UNIT_PAGES_SHIFT	4	/* also max alloc size */
+#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
+#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
+
+struct pcpu_chunk {
+	struct list_head	list;		/* linked to pcpu_slot lists */
+	struct rb_node		rb_node;	/* key is chunk->vm->addr */
+	int			free_size;	/* free bytes in the chunk */
+	int			contig_hint;	/* max contiguous size hint */
+	struct vm_struct	*vm;		/* mapped vmalloc region */
+	int			map_used;	/* # of map entries used */
+	int			map_alloc;	/* # of map entries allocated */
+	int			*map;		/* allocation map */
+	struct page		*page[];	/* #cpus * UNIT_PAGES */
+};
+
+static int pcpu_unit_pages_shift;
+static int pcpu_unit_pages;
+static int pcpu_unit_shift;
+static int pcpu_unit_size;
+static int pcpu_chunk_size;
+static int pcpu_nr_slots;
+static size_t pcpu_chunk_struct_size;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+/* the size of kernel static area */
+static int pcpu_static_size;
+
+/*
+ * One mutex to rule them all.
+ *
+ * The following mutex is grabbed in the outermost public alloc/free
+ * interface functions and released only when the operation is
+ * complete.  As such, every function in this file other than the
+ * outermost functions are called under pcpu_mutex.
+ *
+ * It can easily be switched to use spinlock such that only the area
+ * allocation and page population commit are protected with it doing
+ * actual [de]allocation without holding any lock.  However, given
+ * what this allocator does, I think it's better to let them run
+ * sequentially.
+ */
+static DEFINE_MUTEX(pcpu_mutex);
+
+static struct list_head *pcpu_slot;		/* chunk list slots */
+static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
+
+static int pcpu_size_to_slot(int size)
+{
+	int highbit = fls(size);
+	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
+}
+
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+		return 0;
+
+	return pcpu_size_to_slot(chunk->free_size);
+}
+
+static int pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+	return (cpu << pcpu_unit_pages_shift) + page_idx;
+}
+
+static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
+				      unsigned int cpu, int page_idx)
+{
+	return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+}
+
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+				     unsigned int cpu, int page_idx)
+{
+	return (unsigned long)chunk->vm->addr +
+		(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
+}
+
+static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
+				     int page_idx)
+{
+	return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+}
+
+/**
+ * pcpu_realloc - versatile realloc
+ * @p: the current pointer (can be NULL for new allocations)
+ * @size: the current size (can be 0 for new allocations)
+ * @new_size: the wanted new size (can be 0 for free)
+ *
+ * More robust realloc which can be used to allocate, resize or free a
+ * memory area of arbitrary size.  If the needed size goes over
+ * PAGE_SIZE, kernel VM is used.
+ *
+ * RETURNS:
+ * The new pointer on success, NULL on failure.
+ */
+static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+{
+	void *new;
+
+	if (new_size <= PAGE_SIZE)
+		new = kmalloc(new_size, GFP_KERNEL);
+	else
+		new = vmalloc(new_size);
+	if (new_size && !new)
+		return NULL;
+
+	memcpy(new, p, min(size, new_size));
+	if (new_size > size)
+		memset(new + size, 0, new_size - size);
+
+	if (size <= PAGE_SIZE)
+		kfree(p);
+	else
+		vfree(p);
+
+	return new;
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * This function is called after an allocation or free changed @chunk.
+ * New slot according to the changed state is determined and @chunk is
+ * moved to the slot.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+	int nslot = pcpu_chunk_slot(chunk);
+
+	if (oslot != nslot) {
+		if (oslot < nslot)
+			list_move(&chunk->list, &pcpu_slot[nslot]);
+		else
+			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+	}
+}
+
+static struct rb_node **pcpu_chunk_rb_search(void *addr,
+					     struct rb_node **parentp)
+{
+	struct rb_node **p = &pcpu_addr_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pcpu_chunk *chunk;
+
+	while (*p) {
+		parent = *p;
+		chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
+
+		if (addr < chunk->vm->addr)
+			p = &(*p)->rb_left;
+		else if (addr > chunk->vm->addr)
+			p = &(*p)->rb_right;
+		else
+			break;
+	}
+
+	if (parentp)
+		*parentp = parent;
+	return p;
+}
+
+/**
+ * pcpu_chunk_addr_search - search for chunk containing specified address
+ * @addr: address to search for
+ *
+ * Look for chunk which might contain @addr.  More specifically, it
+ * searchs for the chunk with the highest start address which isn't
+ * beyond @addr.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+	struct rb_node *n, *parent;
+	struct pcpu_chunk *chunk;
+
+	n = *pcpu_chunk_rb_search(addr, &parent);
+	if (!n) {
+		/* no exactly matching chunk, the parent is the closest */
+		n = parent;
+		BUG_ON(!n);
+	}
+	chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+
+	if (addr < chunk->vm->addr) {
+		/* the parent was the next one, look for the previous one */
+		n = rb_prev(n);
+		BUG_ON(!n);
+		chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+	}
+
+	return chunk;
+}
+
+/**
+ * pcpu_chunk_addr_insert - insert chunk into address rb tree
+ * @new: chunk to insert
+ *
+ * Insert @new into address rb tree.
+ */
+static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
+{
+	struct rb_node **p, *parent;
+
+	p = pcpu_chunk_rb_search(new->vm->addr, &parent);
+	BUG_ON(*p);
+	rb_link_node(&new->rb_node, parent, p);
+	rb_insert_color(&new->rb_node, &pcpu_addr_root);
+}
+
+/**
+ * pcpu_split_block - split a map block
+ * @chunk: chunk of interest
+ * @i: index of map block to split
+ * @head: head size (can be 0)
+ * @tail: tail size (can be 0)
+ *
+ * Split the @i'th map block into two or three blocks.  If @head is
+ * non-zero, @head bytes block is inserted before block @i moving it
+ * to @i+1 and reducing its size by @head bytes.
+ *
+ * If @tail is non-zero, the target block, which can be @i or @i+1
+ * depending on @head, is reduced by @tail bytes and @tail byte block
+ * is inserted after the target block.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+{
+	int nr_extra = !!head + !!tail;
+	int target = chunk->map_used + nr_extra;
+
+	/* reallocation required? */
+	if (chunk->map_alloc < target) {
+		int new_alloc = chunk->map_alloc;
+		int *new;
+
+		while (new_alloc < target)
+			new_alloc *= 2;
+
+		new = pcpu_realloc(chunk->map,
+				   chunk->map_alloc * sizeof(new[0]),
+				   new_alloc * sizeof(new[0]));
+		if (!new)
+			return -ENOMEM;
+
+		chunk->map_alloc = new_alloc;
+		chunk->map = new;
+	}
+
+	/* insert a new subblock */
+	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
+		sizeof(chunk->map[0]) * (chunk->map_used - i));
+	chunk->map_used += nr_extra;
+
+	if (head) {
+		chunk->map[i + 1] = chunk->map[i] - head;
+		chunk->map[i++] = head;
+	}
+	if (tail) {
+		chunk->map[i++] -= tail;
+		chunk->map[i] = tail;
+	}
+	return 0;
+}
+
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size
+ * @align: wanted align
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset.  It doesn't
+ * populate or map the area.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -errno on failure.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+{
+	int oslot = pcpu_chunk_slot(chunk);
+	int max_contig = 0;
+	int i, off;
+
+	/*
+	 * The static chunk initially doesn't have map attached
+	 * because kmalloc wasn't available during init.  Give it one.
+	 */
+	if (unlikely(!chunk->map)) {
+		chunk->map = pcpu_realloc(NULL, 0,
+				PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+		if (!chunk->map)
+			return -ENOMEM;
+
+		chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+		chunk->map[chunk->map_used++] = -pcpu_static_size;
+		if (chunk->free_size)
+			chunk->map[chunk->map_used++] = chunk->free_size;
+	}
+
+	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
+		bool is_last = i + 1 == chunk->map_used;
+		int head, tail;
+
+		/* extra for alignment requirement */
+		head = ALIGN(off, align) - off;
+		BUG_ON(i == 0 && head != 0);
+
+		if (chunk->map[i] < 0)
+			continue;
+		if (chunk->map[i] < head + size) {
+			max_contig = max(chunk->map[i], max_contig);
+			continue;
+		}
+
+		/*
+		 * If head is small or the previous block is free,
+		 * merge'em.  Note that 'small' is defined as smaller
+		 * than sizeof(int), which is very small but isn't too
+		 * uncommon for percpu allocations.
+		 */
+		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
+			if (chunk->map[i - 1] > 0)
+				chunk->map[i - 1] += head;
+			else {
+				chunk->map[i - 1] -= head;
+				chunk->free_size -= head;
+			}
+			chunk->map[i] -= head;
+			off += head;
+			head = 0;
+		}
+
+		/* if tail is small, just keep it around */
+		tail = chunk->map[i] - head - size;
+		if (tail < sizeof(int))
+			tail = 0;
+
+		/* split if warranted */
+		if (head || tail) {
+			if (pcpu_split_block(chunk, i, head, tail))
+				return -ENOMEM;
+			if (head) {
+				i++;
+				off += head;
+				max_contig = max(chunk->map[i - 1], max_contig);
+			}
+			if (tail)
+				max_contig = max(chunk->map[i + 1], max_contig);
+		}
+
+		/* update hint and mark allocated */
+		if (is_last)
+			chunk->contig_hint = max_contig; /* fully scanned */
+		else
+			chunk->contig_hint = max(chunk->contig_hint,
+						 max_contig);
+
+		chunk->free_size -= chunk->map[i];
+		chunk->map[i] = -chunk->map[i];
+
+		pcpu_chunk_relocate(chunk, oslot);
+		return off;
+	}
+
+	chunk->contig_hint = max_contig;	/* fully scanned */
+	pcpu_chunk_relocate(chunk, oslot);
+
+	/*
+	 * Tell the upper layer that this chunk has no area left.
+	 * Note that this is not an error condition but a notification
+	 * to upper layer that it needs to look at other chunks.
+	 * -ENOSPC is chosen as it isn't used in memory subsystem and
+	 * matches the meaning in a way.
+	 */
+	return -ENOSPC;
+}
+
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ *
+ * Free area starting from @freeme to @chunk.  Note that this function
+ * only modifies the allocation map.  It doesn't depopulate or unmap
+ * the area.
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+{
+	int oslot = pcpu_chunk_slot(chunk);
+	int i, off;
+
+	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
+		if (off == freeme)
+			break;
+	BUG_ON(off != freeme);
+	BUG_ON(chunk->map[i] > 0);
+
+	chunk->map[i] = -chunk->map[i];
+	chunk->free_size += chunk->map[i];
+
+	/* merge with previous? */
+	if (i > 0 && chunk->map[i - 1] >= 0) {
+		chunk->map[i - 1] += chunk->map[i];
+		chunk->map_used--;
+		memmove(&chunk->map[i], &chunk->map[i + 1],
+			(chunk->map_used - i) * sizeof(chunk->map[0]));
+		i--;
+	}
+	/* merge with next? */
+	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
+		chunk->map[i] += chunk->map[i + 1];
+		chunk->map_used--;
+		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
+			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
+	}
+
+	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
+	pcpu_chunk_relocate(chunk, oslot);
+}
+
+/**
+ * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * If @flush is true, vcache is flushed before unmapping and tlb
+ * after.
+ */
+static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
+		       bool flush)
+{
+	unsigned int last = num_possible_cpus() - 1;
+	unsigned int cpu;
+
+	/*
+	 * Each flushing trial can be very expensive, issue flush on
+	 * the whole region at once rather than doing it for each cpu.
+	 * This could be an overkill but is more scalable.
+	 */
+	if (flush)
+		flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+				   pcpu_chunk_addr(chunk, last, page_end));
+
+	for_each_possible_cpu(cpu)
+		unmap_kernel_range_noflush(
+				pcpu_chunk_addr(chunk, cpu, page_start),
+				(page_end - page_start) << PAGE_SHIFT);
+
+	/* ditto as flush_cache_vunmap() */
+	if (flush)
+		flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+				       pcpu_chunk_addr(chunk, last, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk.  If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off,
+				  size_t size, bool flush)
+{
+	int page_start = PFN_DOWN(off);
+	int page_end = PFN_UP(off + size);
+	int unmap_start = -1;
+	int uninitialized_var(unmap_end);
+	unsigned int cpu;
+	int i;
+
+	for (i = page_start; i < page_end; i++) {
+		for_each_possible_cpu(cpu) {
+			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+			if (!*pagep)
+				continue;
+
+			__free_page(*pagep);
+
+			/*
+			 * If it's partial depopulation, it might get
+			 * populated or depopulated again.  Mark the
+			 * page gone.
+			 */
+			*pagep = NULL;
+
+			unmap_start = unmap_start < 0 ? i : unmap_start;
+			unmap_end = i + 1;
+		}
+	}
+
+	if (unmap_start >= 0)
+		pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+}
+
+/**
+ * pcpu_map - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.
+ * vcache is flushed afterwards.
+ */
+static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+	unsigned int cpu;
+	int err;
+
+	for_each_possible_cpu(cpu) {
+		err = map_kernel_range_noflush(
+				pcpu_chunk_addr(chunk, cpu, page_start),
+				(page_end - page_start) << PAGE_SHIFT,
+				PAGE_KERNEL,
+				pcpu_chunk_pagep(chunk, cpu, page_start));
+		if (err < 0)
+			return err;
+	}
+
+	/* flush at once, please read comments in pcpu_unmap() */
+	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
+			 pcpu_chunk_addr(chunk, last, page_end));
+	return 0;
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk.  The area is cleared on return.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+	const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	int page_start = PFN_DOWN(off);
+	int page_end = PFN_UP(off + size);
+	int map_start = -1;
+	int map_end;
+	unsigned int cpu;
+	int i;
+
+	for (i = page_start; i < page_end; i++) {
+		if (pcpu_chunk_page_occupied(chunk, i)) {
+			if (map_start >= 0) {
+				if (pcpu_map(chunk, map_start, map_end))
+					goto err;
+				map_start = -1;
+			}
+			continue;
+		}
+
+		map_start = map_start < 0 ? i : map_start;
+		map_end = i + 1;
+
+		for_each_possible_cpu(cpu) {
+			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+			*pagep = alloc_pages_node(cpu_to_node(cpu),
+						  alloc_mask, 0);
+			if (!*pagep)
+				goto err;
+		}
+	}
+
+	if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
+		goto err;
+
+	for_each_possible_cpu(cpu)
+		memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0,
+		       size);
+
+	return 0;
+err:
+	/* likely under heavy memory pressure, give memory back */
+	pcpu_depopulate_chunk(chunk, off, size, true);
+	return -ENOMEM;
+}
+
+static void free_pcpu_chunk(struct pcpu_chunk *chunk)
+{
+	if (!chunk)
+		return;
+	if (chunk->vm)
+		free_vm_area(chunk->vm);
+	pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
+	kfree(chunk);
+}
+
+static struct pcpu_chunk *alloc_pcpu_chunk(void)
+{
+	struct pcpu_chunk *chunk;
+
+	chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	chunk->map = pcpu_realloc(NULL, 0,
+				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+	chunk->map[chunk->map_used++] = pcpu_unit_size;
+
+	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+	if (!chunk->vm) {
+		free_pcpu_chunk(chunk);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&chunk->list);
+	chunk->free_size = pcpu_unit_size;
+	chunk->contig_hint = pcpu_unit_size;
+
+	return chunk;
+}
+
+/**
+ * __alloc_percpu - allocate percpu area
+ * @size: size of area to allocate
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	void *ptr = NULL;
+	struct pcpu_chunk *chunk;
+	int slot, off;
+
+	if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT ||
+		     align > PAGE_SIZE)) {
+		WARN(true, "illegal size (%zu) or align (%zu) for "
+		     "percpu allocation\n", size, align);
+		return NULL;
+	}
+
+	mutex_lock(&pcpu_mutex);
+
+	/* allocate area */
+	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (size > chunk->contig_hint)
+				continue;
+			off = pcpu_alloc_area(chunk, size, align);
+			if (off >= 0)
+				goto area_found;
+			if (off != -ENOSPC)
+				goto out_unlock;
+		}
+	}
+
+	/* hmmm... no space left, create a new chunk */
+	chunk = alloc_pcpu_chunk();
+	if (!chunk)
+		goto out_unlock;
+	pcpu_chunk_relocate(chunk, -1);
+	pcpu_chunk_addr_insert(chunk);
+
+	off = pcpu_alloc_area(chunk, size, align);
+	if (off < 0)
+		goto out_unlock;
+
+area_found:
+	/* populate, map and clear the area */
+	if (pcpu_populate_chunk(chunk, off, size)) {
+		pcpu_free_area(chunk, off);
+		goto out_unlock;
+	}
+
+	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
+out_unlock:
+	mutex_unlock(&pcpu_mutex);
+	return ptr;
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+{
+	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+	list_del(&chunk->list);
+	rb_erase(&chunk->rb_node, &pcpu_addr_root);
+	free_pcpu_chunk(chunk);
+}
+
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Free percpu area @ptr.  Might sleep.
+ */
+void free_percpu(void *ptr)
+{
+	void *addr = __pcpu_ptr_to_addr(ptr);
+	struct pcpu_chunk *chunk;
+	int off;
+
+	if (!ptr)
+		return;
+
+	mutex_lock(&pcpu_mutex);
+
+	chunk = pcpu_chunk_addr_search(addr);
+	off = addr - chunk->vm->addr;
+
+	pcpu_free_area(chunk, off);
+
+	/* the chunk became fully free, kill one if there are other free ones */
+	if (chunk->free_size == pcpu_unit_size) {
+		struct pcpu_chunk *pos;
+
+		list_for_each_entry(pos,
+				    &pcpu_slot[pcpu_chunk_slot(chunk)], list)
+			if (pos != chunk) {
+				pcpu_kill_chunk(pos);
+				break;
+			}
+	}
+
+	mutex_unlock(&pcpu_mutex);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+/**
+ * pcpu_setup_static - initialize kernel static percpu area
+ * @populate_pte_fn: callback to allocate pagetable
+ * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages
+ *
+ * Initialize kernel static percpu area.  The caller should allocate
+ * all the necessary pages and pass them in @pages.
+ * @populate_pte_fn() is called on each page to be used for percpu
+ * mapping and is responsible for making sure all the necessary page
+ * tables for the page is allocated.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access.
+ */
+size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
+				struct page **pages, size_t cpu_size)
+{
+	static struct vm_struct static_vm;
+	struct pcpu_chunk *static_chunk;
+	int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE);
+	unsigned int cpu;
+	int err, i;
+
+	pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT,
+				      order_base_2(cpu_size) - PAGE_SHIFT);
+
+	pcpu_static_size = cpu_size;
+	pcpu_unit_pages = 1 << pcpu_unit_pages_shift;
+	pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift;
+	pcpu_unit_size = 1 << pcpu_unit_shift;
+	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+	pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1;
+	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+		+ (1 << pcpu_unit_pages_shift) * sizeof(struct page *);
+
+	/* allocate chunk slots */
+	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+	for (i = 0; i < pcpu_nr_slots; i++)
+		INIT_LIST_HEAD(&pcpu_slot[i]);
+
+	/* init and register vm area */
+	static_vm.flags = VM_ALLOC;
+	static_vm.size = pcpu_chunk_size;
+	vm_area_register_early(&static_vm);
+
+	/* init static_chunk */
+	static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
+	INIT_LIST_HEAD(&static_chunk->list);
+	static_chunk->vm = &static_vm;
+	static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+	static_chunk->contig_hint = static_chunk->free_size;
+
+	/* assign pages and map them */
+	for_each_possible_cpu(cpu) {
+		for (i = 0; i < nr_cpu_pages; i++) {
+			*pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++;
+			populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i));
+		}
+	}
+
+	err = pcpu_map(static_chunk, 0, nr_cpu_pages);
+	if (err)
+		panic("failed to setup static percpu area, err=%d\n", err);
+
+	/* link static_chunk in */
+	pcpu_chunk_relocate(static_chunk, -1);
+	pcpu_chunk_addr_insert(static_chunk);
+
+	/* we're done */
+	pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+	return pcpu_unit_size;
+}
-- 
cgit v0.10.2


From 11124411aa95827404d6bfdfc14c908e1b54513c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:09 +0900
Subject: x86: convert to the new dynamic percpu allocator

Impact: use new dynamic allocator, unified access to static/dynamic
        percpu memory

Convert to the new dynamic percpu allocator.

* implement populate_extra_pte() for both 32 and 64
* update setup_per_cpu_areas() to use pcpu_setup_static()
* define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr()
* define config HAVE_DYNAMIC_PER_CPU_AREA

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f760a22..d3f6ead 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
+config HAVE_DYNAMIC_PER_CPU_AREA
+	def_bool y
+
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b..8f1d2fb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -43,6 +43,14 @@
 #else /* ...!ASSEMBLY */
 
 #include <linux/stringify.h>
+#include <asm/sections.h>
+
+#define __addr_to_pcpu_ptr(addr)					\
+	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
+		 + (unsigned long)__per_cpu_start)
+#define __pcpu_ptr_to_addr(ptr)						\
+	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
+		 - (unsigned long)__per_cpu_start)
 
 #ifdef CONFIG_SMP
 #define __percpu_arg(x)		"%%"__stringify(__percpu_seg)":%P" #x
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6f7c102..dd91c25 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -402,6 +402,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+void populate_extra_pte(unsigned long vaddr);
 
 #ifdef CONFIG_X86_32
 extern void native_pagetable_setup_start(pgd_t *base);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6c..2dce435 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -61,38 +61,56 @@ static inline void setup_percpu_segment(int cpu)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size;
-	char *ptr;
-	int cpu;
-
-	/* Copy section for each CPU (we discard the original) */
-	size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+	ssize_t size = __per_cpu_end - __per_cpu_start;
+	unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+	static struct page **pages;
+	size_t pages_size;
+	unsigned int cpu, i, j;
+	unsigned long delta;
+	size_t pcpu_unit_size;
 
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+	pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size);
 
-	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+	pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]);
+	pages = alloc_bootmem(pages_size);
 
+	j = 0;
 	for_each_possible_cpu(cpu) {
+		void *ptr;
+
+		for (i = 0; i < nr_cpu_pages; i++) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
+			ptr = alloc_bootmem_pages(PAGE_SIZE);
 #else
-		int node = early_cpu_to_node(cpu);
-		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
-			pr_info("cpu %d has no node %d or node-local memory\n",
-				cpu, node);
-			pr_debug("per cpu data for cpu%d at %016lx\n",
-				 cpu, __pa(ptr));
-		} else {
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
-				cpu, node, __pa(ptr));
-		}
+			int node = early_cpu_to_node(cpu);
+
+			if (!node_online(node) || !NODE_DATA(node)) {
+				ptr = alloc_bootmem_pages(PAGE_SIZE);
+				pr_info("cpu %d has no node %d or node-local "
+					"memory\n", cpu, node);
+				pr_debug("per cpu data for cpu%d at %016lx\n",
+					 cpu, __pa(ptr));
+			} else {
+				ptr = alloc_bootmem_pages_node(NODE_DATA(node),
+							       PAGE_SIZE);
+				pr_debug("per cpu data for cpu%d on node%d "
+					 "at %016lx\n", cpu, node, __pa(ptr));
+			}
 #endif
+			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+			pages[j++] = virt_to_page(ptr);
+		}
+	}
+
+	pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size);
 
-		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+	free_bootmem(__pa(pages), pages_size);
+
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu) {
+		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 00263bf..8b1a0ef 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
+void __init populate_extra_pte(unsigned long vaddr)
+{
+	int pgd_idx = pgd_index(vaddr);
+	int pmd_idx = pmd_index(vaddr);
+	pmd_t *pmd;
+
+	pmd = one_md_table_init(swapper_pg_dir + pgd_idx);
+	one_page_table_init(pmd + pmd_idx);
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 					   unsigned long vaddr, pte_t *lastpte)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e6d36b4..7f91e2c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -223,6 +223,25 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
 
+void __init populate_extra_pte(unsigned long vaddr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	pgd = pgd_offset_k(vaddr);
+	if (pgd_none(*pgd)) {
+		pud = (pud_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, pud);
+		if (pud != pud_offset(pgd, 0)) {
+			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+			       pud, pud_offset(pgd, 0));
+			return;
+		}
+	}
+
+	set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0));
+}
+
 /*
  * Create large page table mappings for a range of physical addresses.
  */
-- 
cgit v0.10.2


From cae3aeb83fef5a7c9c8ac40e653e59dd9a35469c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 21 Feb 2009 16:56:23 +0900
Subject: percpu: clean up size usage

Andrew was concerned about the unit of variables named or have suffix
size.  Every usage in percpu allocator is in bytes but make it super
clear by adding comments.

While at it, make pcpu_depopulate_chunk() take int @off and @size like
everyone else.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 4617d97..997724c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -119,7 +119,7 @@ static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
 static int pcpu_size_to_slot(int size)
 {
-	int highbit = fls(size);
+	int highbit = fls(size);	/* size is in bytes */
 	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
 }
 
@@ -158,8 +158,8 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 /**
  * pcpu_realloc - versatile realloc
  * @p: the current pointer (can be NULL for new allocations)
- * @size: the current size (can be 0 for new allocations)
- * @new_size: the wanted new size (can be 0 for free)
+ * @size: the current size in bytes (can be 0 for new allocations)
+ * @new_size: the wanted new size in bytes (can be 0 for free)
  *
  * More robust realloc which can be used to allocate, resize or free a
  * memory area of arbitrary size.  If the needed size goes over
@@ -290,8 +290,8 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * pcpu_split_block - split a map block
  * @chunk: chunk of interest
  * @i: index of map block to split
- * @head: head size (can be 0)
- * @tail: tail size (can be 0)
+ * @head: head size in bytes (can be 0)
+ * @tail: tail size in bytes (can be 0)
  *
  * Split the @i'th map block into two or three blocks.  If @head is
  * non-zero, @head bytes block is inserted before block @i moving it
@@ -346,7 +346,7 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 /**
  * pcpu_alloc_area - allocate area from a pcpu_chunk
  * @chunk: chunk of interest
- * @size: wanted size
+ * @size: wanted size in bytes
  * @align: wanted align
  *
  * Try to allocate @size bytes area aligned at @align from @chunk.
@@ -540,15 +540,15 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
  * @chunk: chunk to depopulate
  * @off: offset to the area to depopulate
- * @size: size of the area to depopulate
+ * @size: size of the area to depopulate in bytes
  * @flush: whether to flush cache and tlb or not
  *
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
  * and tlb after.
  */
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off,
-				  size_t size, bool flush)
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
+				  bool flush)
 {
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
@@ -617,7 +617,7 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
  * @chunk: chunk of interest
  * @off: offset to the area to populate
- * @size: size of the area to populate
+ * @size: size of the area to populate in bytes
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.  The area is cleared on return.
@@ -707,7 +707,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 
 /**
  * __alloc_percpu - allocate percpu area
- * @size: size of area to allocate
+ * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
  *
  * Allocate percpu area of @size bytes aligned at @align.  Might
@@ -819,6 +819,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_static - initialize kernel static percpu area
  * @populate_pte_fn: callback to allocate pagetable
  * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages
+ * @cpu_size: the size of static percpu area in bytes
  *
  * Initialize kernel static percpu area.  The caller should allocate
  * all the necessary pages and pass them in @pages.
-- 
cgit v0.10.2


From 8cfd9e923be54ef66ce174a93f4592b444b96407 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 22 Feb 2009 12:36:55 +0000
Subject: [ARM] RiscPC: Fix etherh oops

The 8390 driver was structured by Al Viro to allow the flexibility
required by platforms.  lib8390.c contains the core code which drivers
explicitly include:
- 8390.c includes lib8390.c to provide the standard ISA based driver.
- etherh.c includes it with the accessors defined for RiscPC platforms,
  where it is addressed via the MMIO accessors with a device dependent
  register spacing.

Other platform drivers do something similar.

However, b9a9b4b caused the kernel to contain not only the etherh
private build of lib8390 (included in etherh.c) but also lib8390.c
itself, and referred the new net_device_ops methods to the ISA version.
The result of this is is not pretty:

Unable to handle kernel paging request at virtual address 12032030
pgd = c8330000
[12032030] *pgd=00000000
Internal error: Oops: 18331805 [#1]
Modules linked in: ipv6
CPU: 0    Not tainted  (2.6.29-rc3 #167)
PC is at do_set_multicast_list+0xd0/0x190
LR is at bitrev32+0x28/0x34
pc : [<c017aab4>]    lr : [<c0139120>]    psr: a0000093
sp : c8321d9c  ip : c8321d84  fp : c8321dbc
r10: c80c6800  r9 : 00000000  r8 : c80c6b60
r7 : c80c6b80  r6 : cc80c800  r5 : c80c6800  r4 : 00000000
r3 : cc80c80c  r2 : 00000004  r1 : 00000007  r0 : e0000000
Flags: NzCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment user
...

Fix up b9a9b4b by making etherh's net_device_ops refer to the internal
lib8390 functions, and remove the build of the ISA 8390.c driver.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/drivers/net/arm/Makefile b/drivers/net/arm/Makefile
index c69c0cd..811a3cc 100644
--- a/drivers/net/arm/Makefile
+++ b/drivers/net/arm/Makefile
@@ -4,7 +4,7 @@
 #
 
 obj-$(CONFIG_ARM_AM79C961A)	+= am79c961a.o
-obj-$(CONFIG_ARM_ETHERH)	+= etherh.o ../8390.o
+obj-$(CONFIG_ARM_ETHERH)	+= etherh.o
 obj-$(CONFIG_ARM_ETHER3)	+= ether3.o
 obj-$(CONFIG_ARM_ETHER1)	+= ether1.o
 obj-$(CONFIG_ARM_AT91_ETHER)	+= at91_ether.o
diff --git a/drivers/net/arm/etherh.c b/drivers/net/arm/etherh.c
index 54b52e5..f52f668c 100644
--- a/drivers/net/arm/etherh.c
+++ b/drivers/net/arm/etherh.c
@@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = {
 	.ndo_open		= etherh_open,
 	.ndo_stop		= etherh_close,
 	.ndo_set_config		= etherh_set_config,
-	.ndo_start_xmit		= ei_start_xmit,
-	.ndo_tx_timeout		= ei_tx_timeout,
-	.ndo_get_stats		= ei_get_stats,
-	.ndo_set_multicast_list = ei_set_multicast_list,
+	.ndo_start_xmit		= __ei_start_xmit,
+	.ndo_tx_timeout		= __ei_tx_timeout,
+	.ndo_get_stats		= __ei_get_stats,
+	.ndo_set_multicast_list = __ei_set_multicast_list,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_change_mtu		= eth_change_mtu,
 #ifdef CONFIG_NET_POLL_CONTROLLER
-	.ndo_poll_controller	= ei_poll,
+	.ndo_poll_controller	= __ei_poll,
 #endif
 };
 
-- 
cgit v0.10.2


From d82ad6d6833ee8248e6c34e1b9fc3c4e61e3f035 Mon Sep 17 00:00:00 2001
From: Andrei Birjukov <andrei.birjukov@artecdesign.ee>
Date: Sun, 22 Feb 2009 22:37:21 +0000
Subject: [ARM] at91: fix for Atmel AT91 powersaving

We've discovered that our AT91SAM9260 board consumed too much power when
returning from a slowclock low-power mode.  RAM self-refresh is enabled in
a bootloader in our case, this is how we saw a difference.  Estimated ca.
30mA more on 4V battery than the same state before powersaving.

After a small research we found that there seems to be a bogus
sdram_selfrefresh_disable() call at the end of at91_pm_enter() call, which
overwrites the LPR register with uninitialized value.  Please find the
suggested patch attached.

This patch fixes correct restoring of LPR register of the Atmel AT91 SDRAM
controller when returning from a power saving mode.

Signed-off-by: Andrei Birjukov <andrei.birjukov@artecdesign.ee>
Acked-by: Andrew Victor <linux@maxim.org.za>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 9bb4f04..7ac812d 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -332,7 +332,6 @@ static int at91_pm_enter(suspend_state_t state)
 			at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR));
 
 error:
-	sdram_selfrefresh_disable();
 	target_state = PM_SUSPEND_ON;
 	at91_irq_resume();
 	at91_gpio_resume();
-- 
cgit v0.10.2


From ec5b3d32437571b8a742069a4cfd04edb6b6eda5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 23 Feb 2009 14:01:04 -0800
Subject: x86, mce: remove invalid __cpuinit/__cpuexit annotations

Impact: Bug fix when CPU hotplug is disabled

Correct the following broken __cpuinit/__cpuexit annotations:

- mce_cpu_features() is called from mce_resume(), and so cannot be
  __cpuinit.
- mce_disable_cpu() and mce_reenable_cpu() are called from
  mce_cpu_callback(), and so cannot be __cpuexit().

Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 60a114c..0625993 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -598,7 +598,7 @@ static void mce_init(void *dummy)
 }
 
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
@@ -1056,7 +1056,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuexit mce_disable_cpu(void *h)
+static void mce_disable_cpu(void *h)
 {
 	int i;
 
@@ -1066,7 +1066,7 @@ static void __cpuexit mce_disable_cpu(void *h)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 }
 
-static void __cpuexit mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void *h)
 {
 	int i;
 
-- 
cgit v0.10.2


From cb83b42e23bd6c4bf91793a320fbe83787c13596 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:20 +0900
Subject: percpu: fix pcpu_chunk_struct_size

Impact: fix short allocation leading to memory corruption

While dropping rvalue wrapping macros around global parameters,
pcpu_chunk_struct_size was set incorrectly resulting in shorter page
pointer array.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 997724c..ed92caa 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -850,7 +850,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 	pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
-		+ (1 << pcpu_unit_pages_shift) * sizeof(struct page *);
+		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 
 	/* allocate chunk slots */
 	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
-- 
cgit v0.10.2


From c132937556f56ee4b831ef4b23f1846e05fde102 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:20 +0900
Subject: bootmem: clean up arch-specific bootmem wrapping

Impact: cleaner and consistent bootmem wrapping

By setting CONFIG_HAVE_ARCH_BOOTMEM_NODE, archs can define
arch-specific wrappers for bootmem allocation.  However, this is done
a bit strangely in that only the high level convenience macros can be
changed while lower level, but still exported, interface functions
can't be wrapped.  This not only is messy but also leads to strange
situation where alloc_bootmem() does what the arch wants it to do but
the equivalent __alloc_bootmem() call doesn't although they should be
able to be used interchangeably.

This patch updates bootmem such that archs can override / wrap the
backend function - alloc_bootmem_core() instead of the highlevel
interface functions to allow simpler and consistent wrapping.  Also,
HAVE_ARCH_BOOTMEM_NODE is renamed to HAVE_ARCH_BOOTMEM.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@saeurebad.de>

diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b189680..05fe305 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
 config QUICKLIST
 	def_bool y
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool n
 
 config ARCH_HAVE_MEMORY_PRESENT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d3f6ead..6fd3b23 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1111,7 +1111,7 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accomodate various tables.
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool y
 	depends on X86_32 && NUMA
 
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 07f1af4..1e0fa9e6 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -93,45 +93,12 @@ static inline int pfn_valid(int pfn)
 #endif /* CONFIG_DISCONTIGMEM */
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-
-/*
- * Following are macros that are specific to this numa platform.
- */
-#define reserve_bootmem(addr, size, flags) \
-	reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
-#define alloc_bootmem(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(pgdat, x)					\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES,	\
-						__pa(MAX_DMA_ADDRESS));	\
-})
-#define alloc_bootmem_pages_node(pgdat, x)				\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE,		\
-						__pa(MAX_DMA_ADDRESS));	\
-})
-#define alloc_bootmem_low_pages_node(pgdat, x)				\
+/* always use node 0 for bootmem on this numa platform */
+#define alloc_bootmem_core(__bdata, size, align, goal, limit)		\
 ({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0);		\
+	bootmem_data_t __maybe_unused *	__abm_bdata_dummy = (__bdata);	\
+	__alloc_bootmem_core(NODE_DATA(0)->bdata,			\
+			     (size), (align), (goal), (limit));		\
 })
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 95837bf..3a87f93 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -69,10 +69,9 @@ extern int reserve_bootmem_node(pg_data_t *pgdat,
 				 unsigned long physaddr,
 				 unsigned long size,
 				 int flags);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
-#endif
-
+extern int reserve_bootmem(unsigned long addr,
+			   unsigned long size,
+			   int flags);
 extern void *__alloc_bootmem_nopanic(unsigned long size,
 			     unsigned long align,
 			     unsigned long goal);
@@ -94,7 +93,7 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 				      unsigned long size,
 				      unsigned long align,
 				      unsigned long goal);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
+
 #define alloc_bootmem(x) \
 	__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_nopanic(x) \
@@ -113,7 +112,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages_node(pgdat, x) \
 	__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
 				   int flags);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 51a0ccf..d7140c0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
 
 static int bootmem_debug;
 
+/*
+ * If an arch needs to apply workarounds to bootmem allocation, it can
+ * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
+ * __alloc_bootmem_core().
+ */
+#ifndef CONFIG_HAVE_ARCH_BOOTMEM
+#define alloc_bootmem_core(bdata, size, align, goal, limit)		\
+	__alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
+#endif
+
 static int __init bootmem_debug_setup(char *buf)
 {
 	bootmem_debug = 1;
@@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 	return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
 }
 
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 /**
  * reserve_bootmem - mark a page range as usable
  * @addr: starting address of the range
@@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
 
 	return mark_bootmem(start, end, 1, flags);
 }
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
 			unsigned long step)
@@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
 	return ALIGN(base + off, align) - base;
 }
 
-static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
 {
-- 
cgit v0.10.2


From 2d0aae41695257603fc281b519677131ab5a752b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:21 +0900
Subject: bootmem: reorder interface functions and add a missing one

Impact: cleanup and addition of missing interface wrapper

The interface functions in bootmem.h was ordered in not so orderly
manner.  Reorder them such that

* functions allocating the same area group together -
  ie. alloc_bootmem group and alloc_bootmem_low group.

* functions w/o node parameter come before the ones w/ node parameter.

* nopanic variants are immediately below their panicky counterparts.

While at it, add alloc_bootmem_pages_node_nopanic() which was missing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@saeurebad.de>

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3a87f93..455d832 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -65,22 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
 #define BOOTMEM_DEFAULT		0
 #define BOOTMEM_EXCLUSIVE	(1<<0)
 
-extern int reserve_bootmem_node(pg_data_t *pgdat,
-				 unsigned long physaddr,
-				 unsigned long size,
-				 int flags);
 extern int reserve_bootmem(unsigned long addr,
 			   unsigned long size,
 			   int flags);
-extern void *__alloc_bootmem_nopanic(unsigned long size,
+extern int reserve_bootmem_node(pg_data_t *pgdat,
+				unsigned long physaddr,
+				unsigned long size,
+				int flags);
+
+extern void *__alloc_bootmem(unsigned long size,
 			     unsigned long align,
 			     unsigned long goal);
-extern void *__alloc_bootmem(unsigned long size,
+extern void *__alloc_bootmem_nopanic(unsigned long size,
 				     unsigned long align,
 				     unsigned long goal);
-extern void *__alloc_bootmem_low(unsigned long size,
-				 unsigned long align,
-				 unsigned long goal);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
@@ -89,6 +87,9 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal);
+extern void *__alloc_bootmem_low(unsigned long size,
+				 unsigned long align,
+				 unsigned long goal);
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 				      unsigned long size,
 				      unsigned long align,
@@ -98,18 +99,21 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 	__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_nopanic(x) \
 	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_nopanic(x) \
 	__alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
+	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+
+#define alloc_bootmem_low(x) \
+	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_pages(x) \
+	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_low_pages_node(pgdat, x) \
 	__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
 
-- 
cgit v0.10.2


From c0c0a29379b5848aec2e8f1c58d853d3cb7118b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:21 +0900
Subject: vmalloc: add @align to vm_area_register_early()

Impact: allow larger alignment for early vmalloc area allocation

Some early vmalloc users might want larger alignment, for example, for
custom large page mapping.  Add @align to vm_area_register_early().
While at it, drop docbook comment on non-existent @size.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index df6df02..91eddd8 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -200,7 +200,7 @@ callback_init(void * kernel_end)
 		/* register the vm area */
 		console_remap_vm.flags = VM_ALLOC;
 		console_remap_vm.size = nr_pages << PAGE_SHIFT;
-		vm_area_register_early(&console_remap_vm);
+		vm_area_register_early(&console_remap_vm, PAGE_SIZE);
 
 		vaddr = (unsigned long)consle_remap_vm.addr;
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 599ba79..2f6994f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -109,6 +109,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
  */
 extern rwlock_t vmlist_lock;
 extern struct vm_struct *vmlist;
-extern __init void vm_area_register_early(struct vm_struct *vm);
+extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/percpu.c b/mm/percpu.c
index ed92caa..41e7a5f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -860,7 +860,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
 	/* init and register vm area */
 	static_vm.flags = VM_ALLOC;
 	static_vm.size = pcpu_chunk_size;
-	vm_area_register_early(&static_vm);
+	vm_area_register_early(&static_vm, PAGE_SIZE);
 
 	/* init static_chunk */
 	static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 224eca9..366ae9e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -995,7 +995,7 @@ EXPORT_SYMBOL(vm_map_ram);
 /**
  * vm_area_register_early - register vmap area early during boot
  * @vm: vm_struct to register
- * @size: size of area to register
+ * @align: requested alignment
  *
  * This function is used to register kernel vm area before
  * vmalloc_init() is called.  @vm->size and @vm->flags should contain
@@ -1004,12 +1004,15 @@ EXPORT_SYMBOL(vm_map_ram);
  *
  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
  */
-void __init vm_area_register_early(struct vm_struct *vm)
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 {
 	static size_t vm_init_off __initdata;
+	unsigned long addr;
+
+	addr = ALIGN(VMALLOC_START + vm_init_off, align);
+	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
 
-	vm->addr = (void *)VMALLOC_START + vm_init_off;
-	vm_init_off = PFN_ALIGN(vm_init_off + vm->size);
+	vm->addr = (void *)addr;
 
 	vm->next = vmlist;
 	vmlist = vm;
-- 
cgit v0.10.2


From 458a3e644c3327be529393982e24277eda8f1ac7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:21 +0900
Subject: x86: update populate_extra_pte() and add populate_extra_pmd()

Impact: minor change to populate_extra_pte() and addition of pmd flavor

Update populate_extra_pte() to return pointer to the pte_t for the
specified address and add populate_extra_pmd() which only populates
till the pmd and returns pointer to the pmd entry for the address.

For 64bit, pud/pmd/pte fill functions are separated out from
set_pte_vaddr[_pud]() and used for set_pte_vaddr[_pud]() and
populate_extra_{pte|pmd}().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dd91c25..46312eb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -402,7 +402,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
-void populate_extra_pte(unsigned long vaddr);
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
 
 #ifdef CONFIG_X86_32
 extern void native_pagetable_setup_start(pgd_t *base);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2dce435..671e652 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -41,6 +41,11 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+static void __init pcpu4k_populate_pte(unsigned long addr)
+{
+	populate_extra_pte(addr);
+}
+
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -104,7 +109,7 @@ void __init setup_per_cpu_areas(void)
 		}
 	}
 
-	pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size);
+	pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size);
 
 	free_bootmem(__pa(pages), pages_size);
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8b1a0ef..84a2688 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,14 +137,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
-void __init populate_extra_pte(unsigned long vaddr)
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	int pgd_idx = pgd_index(vaddr);
 	int pmd_idx = pmd_index(vaddr);
+
+	return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	int pte_idx = pte_index(vaddr);
 	pmd_t *pmd;
 
-	pmd = one_md_table_init(swapper_pg_dir + pgd_idx);
-	one_page_table_init(pmd + pmd_idx);
+	pmd = populate_extra_pmd(vaddr);
+	return one_page_table_init(pmd) + pte_idx;
 }
 
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7f91e2c..7d4e76d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
 	return ptr;
 }
 
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	if (pgd_none(*pgd)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, pud);
+		if (pud != pud_offset(pgd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+			       pud, pud_offset(pgd, 0));
+	}
+	return pud_offset(pgd, vaddr);
+}
 
-	pud = pud_page + pud_index(vaddr);
+static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
+{
 	if (pud_none(*pud)) {
-		pmd = (pmd_t *) spp_getpage();
+		pmd_t *pmd = (pmd_t *) spp_getpage();
 		pud_populate(&init_mm, pud, pmd);
-		if (pmd != pmd_offset(pud, 0)) {
+		if (pmd != pmd_offset(pud, 0))
 			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
-				pmd, pmd_offset(pud, 0));
-			return;
-		}
+			       pmd, pmd_offset(pud, 0));
 	}
-	pmd = pmd_offset(pud, vaddr);
+	return pmd_offset(pud, vaddr);
+}
+
+static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
 	if (pmd_none(*pmd)) {
-		pte = (pte_t *) spp_getpage();
+		pte_t *pte = (pte_t *) spp_getpage();
 		pmd_populate_kernel(&init_mm, pmd, pte);
-		if (pte != pte_offset_kernel(pmd, 0)) {
+		if (pte != pte_offset_kernel(pmd, 0))
 			printk(KERN_ERR "PAGETABLE BUG #02!\n");
-			return;
-		}
 	}
+	return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pud = pud_page + pud_index(vaddr);
+	pmd = fill_pmd(pud, vaddr);
+	pte = fill_pte(pmd, vaddr);
 
-	pte = pte_offset_kernel(pmd, vaddr);
 	set_pte(pte, new_pte);
 
 	/*
@@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__flush_tlb_one(vaddr);
 }
 
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
 	pud_t *pud_page;
@@ -223,23 +239,22 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
 
-void __init populate_extra_pte(unsigned long vaddr)
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 
 	pgd = pgd_offset_k(vaddr);
-	if (pgd_none(*pgd)) {
-		pud = (pud_t *)spp_getpage();
-		pgd_populate(&init_mm, pgd, pud);
-		if (pud != pud_offset(pgd, 0)) {
-			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
-			       pud, pud_offset(pgd, 0));
-			return;
-		}
-	}
+	pud = fill_pud(pgd, vaddr);
+	return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	pmd_t *pmd;
 
-	set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0));
+	pmd = populate_extra_pmd(vaddr);
+	return fill_pte(pmd, vaddr);
 }
 
 /*
-- 
cgit v0.10.2


From d9b55eeb1d55ef2dc5a4fdbff9604c2c68cb5649 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:21 +0900
Subject: percpu: remove unit_size power-of-2 restriction

Impact: allow unit_size to be arbitrary multiple of PAGE_SIZE

In dynamic percpu allocator, there is no reason the unit size should
be power of two.  Remove the restriction.

As non-power-of-two unit size means that empty chunks fall into the
same slot index as lightly occupied chunks which is bad for reclaming.
Reserve an extra slot for empty chunks.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 41e7a5f..d9e6e5d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -67,7 +67,7 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-#define PCPU_MIN_UNIT_PAGES_SHIFT	4	/* also max alloc size */
+#define PCPU_MIN_UNIT_PAGES		16	/* max alloc size in pages */
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
 
@@ -83,9 +83,7 @@ struct pcpu_chunk {
 	struct page		*page[];	/* #cpus * UNIT_PAGES */
 };
 
-static int pcpu_unit_pages_shift;
 static int pcpu_unit_pages;
-static int pcpu_unit_shift;
 static int pcpu_unit_size;
 static int pcpu_chunk_size;
 static int pcpu_nr_slots;
@@ -117,12 +115,19 @@ static DEFINE_MUTEX(pcpu_mutex);
 static struct list_head *pcpu_slot;		/* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
-static int pcpu_size_to_slot(int size)
+static int __pcpu_size_to_slot(int size)
 {
 	int highbit = fls(size);	/* size is in bytes */
 	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
 }
 
+static int pcpu_size_to_slot(int size)
+{
+	if (size == pcpu_unit_size)
+		return pcpu_nr_slots - 1;
+	return __pcpu_size_to_slot(size);
+}
+
 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 {
 	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
@@ -133,7 +138,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 
 static int pcpu_page_idx(unsigned int cpu, int page_idx)
 {
-	return (cpu << pcpu_unit_pages_shift) + page_idx;
+	return cpu * pcpu_unit_pages + page_idx;
 }
 
 static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
@@ -659,7 +664,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 		goto err;
 
 	for_each_possible_cpu(cpu)
-		memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0,
+		memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 		       size);
 
 	return 0;
@@ -722,7 +727,7 @@ void *__alloc_percpu(size_t size, size_t align)
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
-	if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT ||
+	if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE ||
 		     align > PAGE_SIZE)) {
 		WARN(true, "illegal size (%zu) or align (%zu) for "
 		     "percpu allocation\n", size, align);
@@ -840,19 +845,19 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
 	unsigned int cpu;
 	int err, i;
 
-	pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT,
-				      order_base_2(cpu_size) - PAGE_SHIFT);
+	pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size));
 
 	pcpu_static_size = cpu_size;
-	pcpu_unit_pages = 1 << pcpu_unit_pages_shift;
-	pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift;
-	pcpu_unit_size = 1 << pcpu_unit_shift;
+	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
-	pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 
-	/* allocate chunk slots */
+	/*
+	 * Allocate chunk slots.  The additional last slot is for
+	 * empty chunks.
+	 */
+	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
 	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
-- 
cgit v0.10.2


From 8d408b4be37bc49c9086531f2ebe411cf5731746 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:21 +0900
Subject: percpu: give more latitude to arch specific first chunk
 initialization

Impact: more latitude for first percpu chunk allocation

The first percpu chunk serves the kernel static percpu area and may or
may not contain extra room for further dynamic allocation.
Initialization of the first chunk needs to be done before normal
memory allocation service is up, so it has its own init path -
pcpu_setup_static().

It seems archs need more latitude while initializing the first chunk
for example to take advantage of large page mapping.  This patch makes
the following changes to allow this.

* Define PERCPU_DYNAMIC_RESERVE to give arch hint about how much space
  to reserve in the first chunk for further dynamic allocation.

* Rename pcpu_setup_static() to pcpu_setup_first_chunk().

* Make pcpu_setup_first_chunk() much more flexible by fetching page
  pointer by callback and adding optional @unit_size, @free_size and
  @base_addr arguments which allow archs to selectively part of chunk
  initialization to their likings.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 671e652..d928e88 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -41,6 +41,16 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
+
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+{
+	if (pageno < pcpu4k_nr_static_pages)
+		return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+	return NULL;
+}
+
 static void __init pcpu4k_populate_pte(unsigned long addr)
 {
 	populate_extra_pte(addr);
@@ -109,7 +119,10 @@ void __init setup_per_cpu_areas(void)
 		}
 	}
 
-	pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size);
+	pcpu4k_pages = pages;
+	pcpu4k_nr_static_pages = nr_cpu_pages;
+	pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0,
+						NULL, pcpu4k_populate_pte);
 
 	free_bootmem(__pa(pages), pages_size);
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 1808099..910beb0 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -78,12 +78,47 @@
 
 #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 
+/* minimum unit size, also is the maximum supported allocation size */
+#define PCPU_MIN_UNIT_SIZE		(16UL << PAGE_SHIFT)
+
+/*
+ * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
+ * back on the first chunk if arch is manually allocating and mapping
+ * it for faster access (as a part of large page mapping for example).
+ * Note that dynamic percpu allocator covers both static and dynamic
+ * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
+ *
+ * On typical configuration with modules, the following values leave
+ * about 8k of free space on the first chunk after boot on both x86_32
+ * and 64 when module support is enabled.  When module support is
+ * disabled, it's much tighter.
+ */
+#ifndef PERCPU_DYNAMIC_RESERVE
+#  if BITS_PER_LONG > 32
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE	(6 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#    endif
+#  else
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE	(2 << PAGE_SHIFT)
+#    endif
+#  endif
+#endif	/* PERCPU_DYNAMIC_RESERVE */
+
 extern void *pcpu_base_addr;
 
+typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
-extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
-				       struct page **pages, size_t cpu_size);
+extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+					size_t static_size, size_t unit_size,
+					size_t free_size, void *base_addr,
+					pcpu_populate_pte_fn_t populate_pte_fn);
+
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
diff --git a/mm/percpu.c b/mm/percpu.c
index d9e6e5d..9ac0198 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -48,8 +48,8 @@
  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  *   regular address to percpu pointer and back
  *
- * - use pcpu_setup_static() during percpu area initialization to
- *   setup kernel static percpu area
+ * - use pcpu_setup_first_chunk() during percpu area initialization to
+ *   setup the first chunk containing the kernel static percpu area
  */
 
 #include <linux/bitmap.h>
@@ -67,7 +67,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-#define PCPU_MIN_UNIT_PAGES		16	/* max alloc size in pages */
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
 
@@ -80,6 +79,7 @@ struct pcpu_chunk {
 	int			map_used;	/* # of map entries used */
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
+	bool			immutable;	/* no [de]population allowed */
 	struct page		*page[];	/* #cpus * UNIT_PAGES */
 };
 
@@ -521,6 +521,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 	unsigned int last = num_possible_cpus() - 1;
 	unsigned int cpu;
 
+	/* unmap must not be done on immutable chunk */
+	WARN_ON(chunk->immutable);
+
 	/*
 	 * Each flushing trial can be very expensive, issue flush on
 	 * the whole region at once rather than doing it for each cpu.
@@ -602,6 +605,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 	unsigned int cpu;
 	int err;
 
+	/* map must not be done on immutable chunk */
+	WARN_ON(chunk->immutable);
+
 	for_each_possible_cpu(cpu) {
 		err = map_kernel_range_noflush(
 				pcpu_chunk_addr(chunk, cpu, page_start),
@@ -727,8 +733,7 @@ void *__alloc_percpu(size_t size, size_t align)
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
-	if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE ||
-		     align > PAGE_SIZE)) {
+	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 		WARN(true, "illegal size (%zu) or align (%zu) for "
 		     "percpu allocation\n", size, align);
 		return NULL;
@@ -776,6 +781,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
 
 static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
 {
+	WARN_ON(chunk->immutable);
 	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
 	list_del(&chunk->list);
 	rb_erase(&chunk->rb_node, &pcpu_addr_root);
@@ -821,33 +827,73 @@ void free_percpu(void *ptr)
 EXPORT_SYMBOL_GPL(free_percpu);
 
 /**
- * pcpu_setup_static - initialize kernel static percpu area
- * @populate_pte_fn: callback to allocate pagetable
- * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages
- * @cpu_size: the size of static percpu area in bytes
- *
- * Initialize kernel static percpu area.  The caller should allocate
- * all the necessary pages and pass them in @pages.
- * @populate_pte_fn() is called on each page to be used for percpu
- * mapping and is responsible for making sure all the necessary page
- * tables for the page is allocated.
+ * pcpu_setup_first_chunk - initialize the first percpu chunk
+ * @get_page_fn: callback to fetch page pointer
+ * @static_size: the size of static percpu area in bytes
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
+ * @free_size: free size in bytes, 0 for auto
+ * @base_addr: mapped address, NULL for auto
+ * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ *
+ * Initialize the first percpu chunk which contains the kernel static
+ * perpcu area.  This function is to be called from arch percpu area
+ * setup path.  The first two parameters are mandatory.  The rest are
+ * optional.
+ *
+ * @get_page_fn() should return pointer to percpu page given cpu
+ * number and page number.  It should at least return enough pages to
+ * cover the static area.  The returned pages for static area should
+ * have been initialized with valid data.  If @unit_size is specified,
+ * it can also return pages after the static area.  NULL return
+ * indicates end of pages for the cpu.  Note that @get_page_fn() must
+ * return the same number of pages for all cpus.
+ *
+ * @unit_size, if non-zero, determines unit size and must be aligned
+ * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
+ *
+ * @free_size determines the number of free bytes after the static
+ * area in the first chunk.  If zero, whatever left is available.
+ * Specifying non-zero value make percpu leave the area after
+ * @static_size + @free_size alone.
+ *
+ * Non-null @base_addr means that the caller already allocated virtual
+ * region for the first chunk and mapped it.  percpu must not mess
+ * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
+ * @populate_pte_fn doesn't make any sense.
+ *
+ * @populate_pte_fn is used to populate the pagetable.  NULL means the
+ * caller already populated the pagetable.
  *
  * RETURNS:
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
-size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
-				struct page **pages, size_t cpu_size)
+size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+				     size_t static_size, size_t unit_size,
+				     size_t free_size, void *base_addr,
+				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct static_vm;
 	struct pcpu_chunk *static_chunk;
-	int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE);
 	unsigned int cpu;
+	int nr_pages;
 	int err, i;
 
-	pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size));
+	/* santiy checks */
+	BUG_ON(!static_size);
+	BUG_ON(!unit_size && free_size);
+	BUG_ON(unit_size && unit_size < static_size + free_size);
+	BUG_ON(unit_size & ~PAGE_MASK);
+	BUG_ON(base_addr && !unit_size);
+	BUG_ON(base_addr && populate_pte_fn);
 
-	pcpu_static_size = cpu_size;
+	if (unit_size)
+		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
+	else
+		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
+					PFN_UP(static_size));
+
+	pcpu_static_size = static_size;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
@@ -862,29 +908,66 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 
-	/* init and register vm area */
-	static_vm.flags = VM_ALLOC;
-	static_vm.size = pcpu_chunk_size;
-	vm_area_register_early(&static_vm, PAGE_SIZE);
-
 	/* init static_chunk */
 	static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&static_chunk->list);
 	static_chunk->vm = &static_vm;
-	static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+
+	if (free_size)
+		static_chunk->free_size = free_size;
+	else
+		static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+
 	static_chunk->contig_hint = static_chunk->free_size;
 
-	/* assign pages and map them */
+	/* allocate vm address */
+	static_vm.flags = VM_ALLOC;
+	static_vm.size = pcpu_chunk_size;
+
+	if (!base_addr)
+		vm_area_register_early(&static_vm, PAGE_SIZE);
+	else {
+		/*
+		 * Pages already mapped.  No need to remap into
+		 * vmalloc area.  In this case the static chunk can't
+		 * be mapped or unmapped by percpu and is marked
+		 * immutable.
+		 */
+		static_vm.addr = base_addr;
+		static_chunk->immutable = true;
+	}
+
+	/* assign pages */
+	nr_pages = -1;
 	for_each_possible_cpu(cpu) {
-		for (i = 0; i < nr_cpu_pages; i++) {
-			*pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++;
-			populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i));
+		for (i = 0; i < pcpu_unit_pages; i++) {
+			struct page *page = get_page_fn(cpu, i);
+
+			if (!page)
+				break;
+			*pcpu_chunk_pagep(static_chunk, cpu, i) = page;
 		}
+
+		BUG_ON(i < PFN_UP(pcpu_static_size));
+
+		if (nr_pages < 0)
+			nr_pages = i;
+		else
+			BUG_ON(nr_pages != i);
 	}
 
-	err = pcpu_map(static_chunk, 0, nr_cpu_pages);
-	if (err)
-		panic("failed to setup static percpu area, err=%d\n", err);
+	/* map them */
+	if (populate_pte_fn) {
+		for_each_possible_cpu(cpu)
+			for (i = 0; i < nr_pages; i++)
+				populate_pte_fn(pcpu_chunk_addr(static_chunk,
+								cpu, i));
+
+		err = pcpu_map(static_chunk, 0, nr_pages);
+		if (err)
+			panic("failed to setup static percpu area, err=%d\n",
+			      err);
+	}
 
 	/* link static_chunk in */
 	pcpu_chunk_relocate(static_chunk, -1);
-- 
cgit v0.10.2


From 5f5d8405d1c50f5cf7e1dbfe9c9b44e2f015c8fd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:21 +0900
Subject: x86: separate out setup_pcpu_4k() from setup_per_cpu_areas()

Impact: modularize percpu first chunk allocation

x86 is gonna have a few different strategies for the first chunk
allocation.  Modularize it by separating out the current allocation
mechanism into pcpu_alloc_bootmem() and setup_pcpu_4k().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d928e88..4a17c96 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
@@ -41,6 +42,52 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+					unsigned long align)
+{
+	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	int node = early_cpu_to_node(cpu);
+	void *ptr;
+
+	if (!node_online(node) || !NODE_DATA(node)) {
+		ptr = __alloc_bootmem_nopanic(size, align, goal);
+		pr_info("cpu %d has no node %d or node-local memory\n",
+			cpu, node);
+		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+			 cpu, size, __pa(ptr));
+	} else {
+		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+						   size, align, goal);
+		pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+			 "%016lx\n", cpu, size, node, __pa(ptr));
+	}
+	return ptr;
+#else
+	return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
 static struct page **pcpu4k_pages __initdata;
 static int pcpu4k_nr_static_pages __initdata;
 
@@ -56,6 +103,51 @@ static void __init pcpu4k_populate_pte(unsigned long addr)
 	populate_extra_pte(addr);
 }
 
+static ssize_t __init setup_pcpu_4k(size_t static_size)
+{
+	size_t pages_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+	/* unaligned allocations can't be freed, round up to page size */
+	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+			       * sizeof(pcpu4k_pages[0]));
+	pcpu4k_pages = alloc_bootmem(pages_size);
+
+	/* allocate and copy */
+	j = 0;
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+			void *ptr;
+
+			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+			if (!ptr)
+				goto enomem;
+
+			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+			pcpu4k_pages[j++] = virt_to_page(ptr);
+		}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+		pcpu4k_nr_static_pages, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
+				     pcpu4k_populate_pte);
+	goto out_free_ar;
+
+enomem:
+	while (--j >= 0)
+		free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpu4k_pages), pages_size);
+	return ret;
+}
+
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -76,56 +168,24 @@ static inline void setup_percpu_segment(int cpu)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size = __per_cpu_end - __per_cpu_start;
-	unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	static struct page **pages;
-	size_t pages_size;
-	unsigned int cpu, i, j;
+	size_t static_size = __per_cpu_end - __per_cpu_start;
+	unsigned int cpu;
 	unsigned long delta;
 	size_t pcpu_unit_size;
+	ssize_t ret;
 
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
-	pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size);
-
-	pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]);
-	pages = alloc_bootmem(pages_size);
-
-	j = 0;
-	for_each_possible_cpu(cpu) {
-		void *ptr;
-
-		for (i = 0; i < nr_cpu_pages; i++) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-			ptr = alloc_bootmem_pages(PAGE_SIZE);
-#else
-			int node = early_cpu_to_node(cpu);
-
-			if (!node_online(node) || !NODE_DATA(node)) {
-				ptr = alloc_bootmem_pages(PAGE_SIZE);
-				pr_info("cpu %d has no node %d or node-local "
-					"memory\n", cpu, node);
-				pr_debug("per cpu data for cpu%d at %016lx\n",
-					 cpu, __pa(ptr));
-			} else {
-				ptr = alloc_bootmem_pages_node(NODE_DATA(node),
-							       PAGE_SIZE);
-				pr_debug("per cpu data for cpu%d on node%d "
-					 "at %016lx\n", cpu, node, __pa(ptr));
-			}
-#endif
-			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
-			pages[j++] = virt_to_page(ptr);
-		}
-	}
 
-	pcpu4k_pages = pages;
-	pcpu4k_nr_static_pages = nr_cpu_pages;
-	pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0,
-						NULL, pcpu4k_populate_pte);
+	/* allocate percpu area */
+	ret = setup_pcpu_4k(static_size);
+	if (ret < 0)
+		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+		      static_size, ret);
 
-	free_bootmem(__pa(pages), pages_size);
+	pcpu_unit_size = ret;
 
+	/* alrighty, percpu areas up and running */
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
 		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
-- 
cgit v0.10.2


From 89c9215165ca609096e845926d9a18f1306176a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:21 +0900
Subject: x86: add embedding percpu first chunk allocator

Impact: add better first percpu allocation for !NUMA

On !NUMA, we can simply allocate contiguous memory and use it for the
first chunk without mapping it into vmalloc area.  As the memory area
is covered by the large page physical memory mapping, it allows the
dynamic perpcu allocator to not add any TLB overhead for the static
percpu area and whatever falls into the first chunk and the
implementation is very simple too.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 4a17c96..fd4c399 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -43,6 +43,35 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 EXPORT_SYMBOL(__per_cpu_offset);
 
 /**
+ * pcpu_need_numa - determine percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA.  This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	pg_data_t *last = NULL;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int node = early_cpu_to_node(cpu);
+
+		if (node_online(node) && NODE_DATA(node) &&
+		    last && last != NODE_DATA(node))
+			return true;
+
+		last = NODE_DATA(node);
+	}
+#endif
+	return false;
+}
+
+/**
  * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
  * @cpu: cpu to allocate for
  * @size: size allocation in bytes
@@ -82,6 +111,59 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 }
 
 /*
+ * Embedding allocator
+ *
+ * The first chunk is sized to just contain the static area plus
+ * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
+ * bootmem allocator and used as-is without being mapped into vmalloc
+ * area.  This enables the first chunk to piggy back on the linear
+ * physical PMD mapping and doesn't add any additional pressure to
+ * TLB.
+ */
+static void *pcpue_ptr __initdata;
+static size_t pcpue_unit_size __initdata;
+
+static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
+{
+	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
+			    + ((size_t)pageno << PAGE_SHIFT));
+}
+
+static ssize_t __init setup_pcpu_embed(size_t static_size)
+{
+	unsigned int cpu;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, embedding allocation doesn't play well with
+	 * NUMA.
+	 */
+	if (!cpu_has_pse || pcpu_need_numa())
+		return -EINVAL;
+
+	/* allocate and copy */
+	pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
+	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
+				       PAGE_SIZE);
+	if (!pcpue_ptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
+		       static_size);
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
+		pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+				      pcpue_unit_size,
+				      pcpue_unit_size - static_size, pcpue_ptr,
+				      NULL);
+}
+
+/*
  * 4k page allocator
  *
  * This is the basic allocator.  Static percpu area is allocated
@@ -178,7 +260,9 @@ void __init setup_per_cpu_areas(void)
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
 	/* allocate percpu area */
-	ret = setup_pcpu_4k(static_size);
+	ret = setup_pcpu_embed(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_4k(static_size);
 	if (ret < 0)
 		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
 		      static_size, ret);
-- 
cgit v0.10.2


From 8ac837571491e239e64bd87863c1679d8002e8a2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 11:57:22 +0900
Subject: x86: add remapping percpu first chunk allocator

Impact: add better first percpu allocation for NUMA

On NUMA, embedding allocator can't be used as different units can't be
made to fall in the correct NUMA nodes.  To use large page mapping,
each unit needs to be remapped.  However, percpu areas are usually
much smaller than large page size and unused space hurts a lot as the
number of cpus grow.  This allocator remaps large pages for each chunk
but gives back unused part to the bootmem allocator making the large
pages mapped twice.

This adds slightly to the TLB pressure but is much better than using
4k mappings while still being NUMA-friendly.

Ingo suggested that this would be the correct approach for NUMA.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index fd4c399..2d946a8 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -111,6 +111,133 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 }
 
 /*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit.  A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk.  Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk.  The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
+
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpur_size)
+		return NULL;
+
+	return virt_to_page(pcpur_ptrs[cpu] + off);
+}
+
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+	static struct vm_struct vm;
+	pg_data_t *last;
+	size_t ptrs_size;
+	unsigned int cpu;
+	ssize_t ret;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, on non-NUMA, embedding is better.
+	 */
+	if (!cpu_has_pse || pcpu_need_numa())
+		return -EINVAL;
+
+	last = NULL;
+	for_each_possible_cpu(cpu) {
+		int node = early_cpu_to_node(cpu);
+
+		if (node_online(node) && NODE_DATA(node) &&
+		    last && last != NODE_DATA(node))
+			goto proceed;
+
+		last = NODE_DATA(node);
+	}
+	return -EINVAL;
+
+proceed:
+	/*
+	 * Currently supports only single page.  Supporting multiple
+	 * pages won't be too difficult if it ever becomes necessary.
+	 */
+	pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	if (pcpur_size > PMD_SIZE) {
+		pr_warning("PERCPU: static data is larger than large page, "
+			   "can't use large page\n");
+		return -EINVAL;
+	}
+
+	/* allocate pointer array and alloc large pages */
+	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+	pcpur_ptrs = alloc_bootmem(ptrs_size);
+
+	for_each_possible_cpu(cpu) {
+		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+		if (!pcpur_ptrs[cpu])
+			goto enomem;
+
+		/*
+		 * Only use pcpur_size bytes and give back the rest.
+		 *
+		 * Ingo: The 2MB up-rounding bootmem is needed to make
+		 * sure the partial 2MB page is still fully RAM - it's
+		 * not well-specified to have a PAT-incompatible area
+		 * (unmapped RAM, device memory, etc.) in that hole.
+		 */
+		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+			     PMD_SIZE - pcpur_size);
+
+		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+	}
+
+	/* allocate address and map */
+	vm.flags = VM_ALLOC;
+	vm.size = num_possible_cpus() * PMD_SIZE;
+	vm_area_register_early(&vm, PMD_SIZE);
+
+	for_each_possible_cpu(cpu) {
+		pmd_t *pmd;
+
+		pmd = populate_extra_pmd((unsigned long)vm.addr
+					 + cpu * PMD_SIZE);
+		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+				     PAGE_KERNEL_LARGE));
+	}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Remapped at %p with large pages, static data "
+		"%zu bytes\n", vm.addr, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
+				     pcpur_size - static_size, vm.addr, NULL);
+	goto out_free_ar;
+
+enomem:
+	for_each_possible_cpu(cpu)
+		if (pcpur_ptrs[cpu])
+			free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+	return ret;
+}
+#else
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+	return -EINVAL;
+}
+#endif
+
+/*
  * Embedding allocator
  *
  * The first chunk is sized to just contain the static area plus
@@ -259,8 +386,14 @@ void __init setup_per_cpu_areas(void)
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
-	/* allocate percpu area */
-	ret = setup_pcpu_embed(static_size);
+	/*
+	 * Allocate percpu area.  If PSE is supported, try to make use
+	 * of large page mappings.  Please read comments on top of
+	 * each allocator for details.
+	 */
+	ret = setup_pcpu_remap(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_embed(static_size);
 	if (ret < 0)
 		ret = setup_pcpu_4k(static_size);
 	if (ret < 0)
-- 
cgit v0.10.2


From 40150d37be7f7949b2ec07d511244da856647d84 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 24 Feb 2009 12:32:28 +0900
Subject: percpu: add __read_mostly to variables which are mostly read only

Most global variables in percpu allocator are initialized during boot
and read only from that point on.  Add __read_mostly as per Rusty's
suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/mm/percpu.c b/mm/percpu.c
index 9ac0198..5954e7a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -83,18 +83,18 @@ struct pcpu_chunk {
 	struct page		*page[];	/* #cpus * UNIT_PAGES */
 };
 
-static int pcpu_unit_pages;
-static int pcpu_unit_size;
-static int pcpu_chunk_size;
-static int pcpu_nr_slots;
-static size_t pcpu_chunk_struct_size;
+static int pcpu_unit_pages __read_mostly;
+static int pcpu_unit_size __read_mostly;
+static int pcpu_chunk_size __read_mostly;
+static int pcpu_nr_slots __read_mostly;
+static size_t pcpu_chunk_struct_size __read_mostly;
 
 /* the address of the first chunk which starts with the kernel static area */
-void *pcpu_base_addr;
+void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
 /* the size of kernel static area */
-static int pcpu_static_size;
+static int pcpu_static_size __read_mostly;
 
 /*
  * One mutex to rule them all.
@@ -112,7 +112,7 @@ static int pcpu_static_size;
  */
 static DEFINE_MUTEX(pcpu_mutex);
 
-static struct list_head *pcpu_slot;		/* chunk list slots */
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
 static int __pcpu_size_to_slot(int size)
-- 
cgit v0.10.2


From 0af98d37e85e6958eb84987b1f60da3b54008317 Mon Sep 17 00:00:00 2001
From: Phil Sutter <n0-1@freewrt.org>
Date: Sun, 8 Feb 2009 16:44:42 +0100
Subject: [WATCHDOG] rc32434_wdt: fix watchdog driver

The existing driver code wasn't working. Neither the timeout was set
correctly, nor system reset was being triggered, as the driver seemed
to keep the WDT alive himself. There was also some unnecessary code.

Signed-off-by: Phil Sutter <n0-1@freewrt.org>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Cc: stable <stable@kernel.org>

diff --git a/drivers/watchdog/rc32434_wdt.c b/drivers/watchdog/rc32434_wdt.c
index 57027f4..f1f98cd 100644
--- a/drivers/watchdog/rc32434_wdt.c
+++ b/drivers/watchdog/rc32434_wdt.c
@@ -34,104 +34,89 @@
 #include <asm/time.h>
 #include <asm/mach-rc32434/integ.h>
 
-#define MAX_TIMEOUT			20
-#define RC32434_WDT_INTERVAL		(15 * HZ)
-
-#define VERSION "0.2"
+#define VERSION "0.3"
 
 static struct {
-	struct completion stop;
-	int running;
-	struct timer_list timer;
-	int queue;
-	int default_ticks;
 	unsigned long inuse;
 } rc32434_wdt_device;
 
 static struct integ __iomem *wdt_reg;
-static int ticks = 100 * HZ;
 
 static int expect_close;
-static int timeout;
+
+/* Board internal clock speed in Hz,
+ * the watchdog timer ticks at. */
+extern unsigned int idt_cpu_freq;
+
+/* translate wtcompare value to seconds and vice versa */
+#define WTCOMP2SEC(x)	(x / idt_cpu_freq)
+#define SEC2WTCOMP(x)	(x * idt_cpu_freq)
+
+/* Use a default timeout of 20s. This should be
+ * safe for CPU clock speeds up to 400MHz, as
+ * ((2 ^ 32) - 1) / (400MHz / 2) = 21s.  */
+#define WATCHDOG_TIMEOUT 20
+
+static int timeout = WATCHDOG_TIMEOUT;
 
 static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
 	__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
+/* apply or and nand masks to data read from addr and write back */
+#define SET_BITS(addr, or, nand) \
+	writel((readl(&addr) | or) & ~nand, &addr)
 
 static void rc32434_wdt_start(void)
 {
-	u32 val;
-
-	if (!rc32434_wdt_device.inuse) {
-		writel(0, &wdt_reg->wtcount);
+	u32 or, nand;
 
-		val = RC32434_ERR_WRE;
-		writel(readl(&wdt_reg->errcs) | val, &wdt_reg->errcs);
+	/* zero the counter before enabling */
+	writel(0, &wdt_reg->wtcount);
 
-		val = RC32434_WTC_EN;
-		writel(readl(&wdt_reg->wtc) | val, &wdt_reg->wtc);
-	}
-	rc32434_wdt_device.running++;
-}
+	/* don't generate a non-maskable interrupt,
+	 * do a warm reset instead */
+	nand = 1 << RC32434_ERR_WNE;
+	or = 1 << RC32434_ERR_WRE;
 
-static void rc32434_wdt_stop(void)
-{
-	u32 val;
+	/* reset the ERRCS timeout bit in case it's set */
+	nand |= 1 << RC32434_ERR_WTO;
 
-	if (rc32434_wdt_device.running) {
+	SET_BITS(wdt_reg->errcs, or, nand);
 
-		val = ~RC32434_WTC_EN;
-		writel(readl(&wdt_reg->wtc) & val, &wdt_reg->wtc);
+	/* reset WTC timeout bit and enable WDT */
+	nand = 1 << RC32434_WTC_TO;
+	or = 1 << RC32434_WTC_EN;
 
-		val = ~RC32434_ERR_WRE;
-		writel(readl(&wdt_reg->errcs) & val, &wdt_reg->errcs);
+	SET_BITS(wdt_reg->wtc, or, nand);
+}
 
-		rc32434_wdt_device.running = 0;
-	}
+static void rc32434_wdt_stop(void)
+{
+	/* Disable WDT */
+	SET_BITS(wdt_reg->wtc, 0, 1 << RC32434_WTC_EN);
 }
 
-static void rc32434_wdt_set(int new_timeout)
+static int rc32434_wdt_set(int new_timeout)
 {
-	u32 cmp = new_timeout * HZ;
-	u32 state, val;
+	int max_to = WTCOMP2SEC((u32)-1);
 
+	if (new_timeout < 0 || new_timeout > max_to) {
+		printk(KERN_ERR KBUILD_MODNAME
+			": timeout value must be between 0 and %d",
+			max_to);
+		return -EINVAL;
+	}
 	timeout = new_timeout;
-	/*
-	 * store and disable WTC
-	 */
-	state = (u32)(readl(&wdt_reg->wtc) & RC32434_WTC_EN);
-	val = ~RC32434_WTC_EN;
-	writel(readl(&wdt_reg->wtc) & val, &wdt_reg->wtc);
-
-	writel(0, &wdt_reg->wtcount);
-	writel(cmp, &wdt_reg->wtcompare);
-
-	/*
-	 * restore WTC
-	 */
-
-	writel(readl(&wdt_reg->wtc) | state, &wdt_reg);
-}
+	writel(SEC2WTCOMP(timeout), &wdt_reg->wtcompare);
 
-static void rc32434_wdt_reset(void)
-{
-	ticks = rc32434_wdt_device.default_ticks;
+	return 0;
 }
 
-static void rc32434_wdt_update(unsigned long unused)
+static void rc32434_wdt_ping(void)
 {
-	if (rc32434_wdt_device.running)
-		ticks--;
-
 	writel(0, &wdt_reg->wtcount);
-
-	if (rc32434_wdt_device.queue && ticks)
-		mod_timer(&rc32434_wdt_device.timer,
-			jiffies + RC32434_WDT_INTERVAL);
-	else
-		complete(&rc32434_wdt_device.stop);
 }
 
 static int rc32434_wdt_open(struct inode *inode, struct file *file)
@@ -142,19 +127,23 @@ static int rc32434_wdt_open(struct inode *inode, struct file *file)
 	if (nowayout)
 		__module_get(THIS_MODULE);
 
+	rc32434_wdt_start();
+	rc32434_wdt_ping();
+
 	return nonseekable_open(inode, file);
 }
 
 static int rc32434_wdt_release(struct inode *inode, struct file *file)
 {
-	if (expect_close && nowayout == 0) {
+	if (expect_close == 42) {
 		rc32434_wdt_stop();
 		printk(KERN_INFO KBUILD_MODNAME ": disabling watchdog timer\n");
 		module_put(THIS_MODULE);
-	} else
+	} else {
 		printk(KERN_CRIT KBUILD_MODNAME
 			": device closed unexpectedly. WDT will not stop !\n");
-
+		rc32434_wdt_ping();
+	}
 	clear_bit(0, &rc32434_wdt_device.inuse);
 	return 0;
 }
@@ -174,10 +163,10 @@ static ssize_t rc32434_wdt_write(struct file *file, const char *data,
 				if (get_user(c, data + i))
 					return -EFAULT;
 				if (c == 'V')
-					expect_close = 1;
+					expect_close = 42;
 			}
 		}
-		rc32434_wdt_update(0);
+		rc32434_wdt_ping();
 		return len;
 	}
 	return 0;
@@ -197,11 +186,11 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd,
 	};
 	switch (cmd) {
 	case WDIOC_KEEPALIVE:
-		rc32434_wdt_reset();
+		rc32434_wdt_ping();
 		break;
 	case WDIOC_GETSTATUS:
 	case WDIOC_GETBOOTSTATUS:
-		value = readl(&wdt_reg->wtcount);
+		value = 0;
 		if (copy_to_user(argp, &value, sizeof(int)))
 			return -EFAULT;
 		break;
@@ -218,6 +207,7 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd,
 			break;
 		case WDIOS_DISABLECARD:
 			rc32434_wdt_stop();
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -225,11 +215,9 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd,
 	case WDIOC_SETTIMEOUT:
 		if (copy_from_user(&new_timeout, argp, sizeof(int)))
 			return -EFAULT;
-		if (new_timeout < 1)
+		if (rc32434_wdt_set(new_timeout))
 			return -EINVAL;
-		if (new_timeout > MAX_TIMEOUT)
-			return -EINVAL;
-		rc32434_wdt_set(new_timeout);
+		/* Fall through */
 	case WDIOC_GETTIMEOUT:
 		return copy_to_user(argp, &timeout, sizeof(int));
 	default:
@@ -262,7 +250,7 @@ static int rc32434_wdt_probe(struct platform_device *pdev)
 	int ret;
 	struct resource *r;
 
-	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rb500_wdt_res");
+	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rb532_wdt_res");
 	if (!r) {
 		printk(KERN_ERR KBUILD_MODNAME
 			"failed to retrieve resources\n");
@@ -277,24 +265,12 @@ static int rc32434_wdt_probe(struct platform_device *pdev)
 	}
 
 	ret = misc_register(&rc32434_wdt_miscdev);
-
 	if (ret < 0) {
 		printk(KERN_ERR KBUILD_MODNAME
 			"failed to register watchdog device\n");
 		goto unmap;
 	}
 
-	init_completion(&rc32434_wdt_device.stop);
-	rc32434_wdt_device.queue = 0;
-
-	clear_bit(0, &rc32434_wdt_device.inuse);
-
-	setup_timer(&rc32434_wdt_device.timer, rc32434_wdt_update, 0L);
-
-	rc32434_wdt_device.default_ticks = ticks;
-
-	rc32434_wdt_start();
-
 	printk(banner, timeout);
 
 	return 0;
@@ -306,14 +282,8 @@ unmap:
 
 static int rc32434_wdt_remove(struct platform_device *pdev)
 {
-	if (rc32434_wdt_device.queue) {
-		rc32434_wdt_device.queue = 0;
-		wait_for_completion(&rc32434_wdt_device.stop);
-	}
 	misc_deregister(&rc32434_wdt_miscdev);
-
 	iounmap(wdt_reg);
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From d9a8798c4bab5ccd40e45e011f668099cfb3eb83 Mon Sep 17 00:00:00 2001
From: Phil Sutter <n0-1@freewrt.org>
Date: Sun, 8 Feb 2009 16:44:42 +0100
Subject: [WATCHDOG] rc32434_wdt: fix sections

Fix init and exit sections.

Signed-off-by: Phil Sutter <n0-1@freewrt.org>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Cc: stable <stable@kernel.org>

diff --git a/drivers/watchdog/rc32434_wdt.c b/drivers/watchdog/rc32434_wdt.c
index f1f98cd..f3553fa 100644
--- a/drivers/watchdog/rc32434_wdt.c
+++ b/drivers/watchdog/rc32434_wdt.c
@@ -34,7 +34,7 @@
 #include <asm/time.h>
 #include <asm/mach-rc32434/integ.h>
 
-#define VERSION "0.3"
+#define VERSION "0.4"
 
 static struct {
 	unsigned long inuse;
@@ -242,10 +242,10 @@ static struct miscdevice rc32434_wdt_miscdev = {
 	.fops	= &rc32434_wdt_fops,
 };
 
-static char banner[] = KERN_INFO KBUILD_MODNAME
+static char banner[] __devinitdata = KERN_INFO KBUILD_MODNAME
 		": Watchdog Timer version " VERSION ", timer margin: %d sec\n";
 
-static int rc32434_wdt_probe(struct platform_device *pdev)
+static int __devinit rc32434_wdt_probe(struct platform_device *pdev)
 {
 	int ret;
 	struct resource *r;
@@ -280,7 +280,7 @@ unmap:
 	return ret;
 }
 
-static int rc32434_wdt_remove(struct platform_device *pdev)
+static int __devexit rc32434_wdt_remove(struct platform_device *pdev)
 {
 	misc_deregister(&rc32434_wdt_miscdev);
 	iounmap(wdt_reg);
@@ -289,8 +289,8 @@ static int rc32434_wdt_remove(struct platform_device *pdev)
 
 static struct platform_driver rc32434_wdt = {
 	.probe	= rc32434_wdt_probe,
-	.remove = rc32434_wdt_remove,
-	.driver = {
+	.remove	= __devexit_p(rc32434_wdt_remove),
+	.driver	= {
 		.name = "rc32434_wdt",
 	}
 };
-- 
cgit v0.10.2


From c8532db7f2661b63f658b9a08cf4053a3e6abb78 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@sirena.org.uk>
Date: Tue, 24 Feb 2009 15:55:48 +0100
Subject: [ARM] 5411/1: S3C64XX: Fix EINT unmask

Currently the unmask function for EINT interrupts was setting the mask
bit rather than clearing it.  This was also previously reported and
fixed by Kyungmin Park <kyungmin.park@samsung.com> and others.

Acked-By: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/plat-s3c64xx/irq-eint.c b/arch/arm/plat-s3c64xx/irq-eint.c
index 1f7cc00..ebb305c 100644
--- a/arch/arm/plat-s3c64xx/irq-eint.c
+++ b/arch/arm/plat-s3c64xx/irq-eint.c
@@ -55,7 +55,7 @@ static void s3c_irq_eint_unmask(unsigned int irq)
 	u32 mask;
 
 	mask = __raw_readl(S3C64XX_EINT0MASK);
-	mask |= eint_irq_to_bit(irq);
+	mask &= ~eint_irq_to_bit(irq);
 	__raw_writel(mask, S3C64XX_EINT0MASK);
 }
 
-- 
cgit v0.10.2


From 41fdff322e26c4a86fe65cf577f2556a650cb7bc Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:30 +0100
Subject: x86, mce, cmci: export MAX_NR_BANKS

Impact: Cleanup (code movement)

Move MAX_NR_BANKS into mce.h because it's needed there
for followup patches.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 225cdb5..39136c4 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -95,6 +95,12 @@ void mce_log(struct mce *m);
 DECLARE_PER_CPU(struct sys_device, device_mce);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
 #else
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index a4a7c68..39f8bb5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -37,12 +37,6 @@
 
 #define MISC_MCELOG_MINOR 227
 
-/*
- * To support more than 128 would need to escape the predefined
- * Linux defined extended banks first.
- */
-#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
-
 atomic_t mce_entry;
 
 static int mce_dont_init;
-- 
cgit v0.10.2


From b276268631af3a1b0df871e10d19d492f0513d4b Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:31 +0100
Subject: x86, mce, cmci: factor out threshold interrupt handler

Impact: cleanup; preparation for feature

The mce_amd_64 code has an own private MC threshold vector with an own
interrupt handler. Since Intel needs a similar handler
it makes sense to share the vector because both can not
be active at the same time.

I factored the common APIC handler code into a separate file which can
be used by both the Intel or AMD MC code.

This is needed for the next patch which adds an Intel specific
CMCI handler.

This patch should be a nop for AMD, it just moves some code
around.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9c39095..52d7013 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -751,6 +751,11 @@ config X86_MCE_AMD
 	   Additional support for AMD specific MCE features such as
 	   the DRAM Error Threshold.
 
+config X86_MCE_THRESHOLD
+	depends on X86_MCE_AMD || X86_MCE_INTEL
+	bool
+	default y
+
 config X86_MCE_NONFATAL
 	tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
 	depends on X86_32 && X86_MCE
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 39136c4..125cd87 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -135,5 +135,7 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
 #define mcheck_init(c) do { } while (0)
 #endif
 
+extern void (*mce_threshold_vector)(void);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323..b2f8982 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index e82c820..49705be 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
 
 static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
 
+static void amd_threshold_interrupt(void);
+
 /*
  * CPU Initialization
  */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			tr.reset = 0;
 			tr.old_limit = 0;
 			threshold_restart_bank(&tr);
+
+			mce_threshold_vector = amd_threshold_interrupt;
 		}
 	}
 }
@@ -187,16 +191,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
  */
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
 {
 	unsigned int bank, block;
 	struct mce m;
 	u32 low = 0, high = 0, address = 0;
 
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
 	mce_setup(&m);
 
 	/* assume first bank caused it */
@@ -241,13 +241,10 @@ asmlinkage void mce_threshold_interrupt(void)
 				       + bank * NR_BLOCKS
 				       + block;
 				mce_log(&m);
-				goto out;
+				return;
 			}
 		}
 	}
-out:
-	inc_irq_stat(irq_threshold_count);
-	irq_exit();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 0000000..4319142
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,24 @@
+/* Common corrected MCE threshold handler code */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <asm/mce.h>
+#include <asm/irq_vectors.h>
+#include <asm/idle.h>
+
+static void default_threshold_interrupt(void)
+{
+	printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+			 THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+	ack_APIC_irq();
+	exit_idle();
+	irq_enter();
+	inc_irq_stat(irq_threshold_count);
+	mce_threshold_vector();
+	irq_exit();
+}
-- 
cgit v0.10.2


From f9695df42cdbca78530b4458c38ecfdd0bb90079 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:32 +0100
Subject: x86, mce, cmci: avoid potential reentry of threshold interrupt

Impact: minor bugfix

The threshold handler on AMD (and soon on Intel) could be theoretically
reentered by the hardware. This could lead to corrupted events
because the machine check poll code assumes it is not reentered.

Move the APIC ACK to the end of the interrupt handler to let
the hardware avoid that.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 4319142..e4b8a38 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -15,10 +15,11 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
 asmlinkage void mce_threshold_interrupt(void)
 {
-	ack_APIC_irq();
 	exit_idle();
 	irq_enter();
 	inc_irq_stat(irq_threshold_count);
 	mce_threshold_vector();
 	irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+	ack_APIC_irq();
 }
-- 
cgit v0.10.2


From 8457c84d68678cbfd4167a9073b89da58e48c037 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:33 +0100
Subject: x86, mce: replace machine check events logged interval with ratelimit

Impact: behavior change, use common code

Use a standard leaky bucket ratelimit for the machine check
warning print interval instead of waiting every check_interval.
Also decrease the limit to twice per minute.
This interacts better with threshold interrupts because
they can happen more often than check_interval.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 39f8bb5..9017609 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -28,6 +28,7 @@
 #include <linux/kdebug.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+#include <linux/ratelimit.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -488,11 +489,11 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
  */
 int mce_notify_user(void)
 {
+	/* Not more than two messages every minute */
+	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
 	clear_thread_flag(TIF_MCE_NOTIFY);
 	if (test_and_clear_bit(0, &notify_user)) {
-		static unsigned long last_print;
-		unsigned long now = jiffies;
-
 		wake_up_interruptible(&mce_wait);
 
 		/*
@@ -503,10 +504,8 @@ int mce_notify_user(void)
 		if (trigger[0] && !work_pending(&mce_trigger_work))
 			schedule_work(&mce_trigger_work);
 
-		if (time_after_eq(now, last_print + (check_interval*HZ))) {
-			last_print = now;
+		if (__ratelimit(&ratelimit))
 			printk(KERN_INFO "Machine check events logged\n");
-		}
 
 		return 1;
 	}
-- 
cgit v0.10.2


From ee031c31d6381d004bfd386c2e45821211507499 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:34 +0100
Subject: x86, mce, cmci: use polled banks bitmap in machine check poller

Define a per cpu bitmap that contains the banks polled by the machine
check poller. This is needed for the CMCI code in the next patches
to be able to disable polling on specific banks.

The bank by default contains all banks, so there is no behaviour
change. Only future code will remove some banks from the polling
set.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 125cd87..9b952369 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -119,11 +119,14 @@ extern atomic_t mce_entry;
 
 extern void do_machine_check(struct pt_regs *, long);
 
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
 enum mcp_flags {
 	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
 	MCP_UC = (1 << 1),		/* log uncorrected errors */
 };
-extern void machine_check_poll(enum mcp_flags flags);
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 extern int mce_notify_user(void);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 9017609..a8ff38b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -62,6 +62,11 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -191,7 +196,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
  *
  * This is executed in standard interrupt context.
  */
-void machine_check_poll(enum mcp_flags flags)
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
 	struct mce m;
 	int i;
@@ -200,7 +205,7 @@ void machine_check_poll(enum mcp_flags flags)
 
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	for (i = 0; i < banks; i++) {
-		if (!bank[i])
+		if (!bank[i] || !test_bit(i, *b))
 			continue;
 
 		m.misc = 0;
@@ -458,7 +463,8 @@ static void mcheck_timer(unsigned long data)
 	WARN_ON(smp_processor_id() != data);
 
 	if (mce_available(&current_cpu_data))
-		machine_check_poll(MCP_TIMESTAMP);
+		machine_check_poll(MCP_TIMESTAMP,
+				&__get_cpu_var(mce_poll_banks));
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -572,11 +578,13 @@ static void mce_init(void *dummy)
 {
 	u64 cap;
 	int i;
+	mce_banks_t all_banks;
 
 	/*
 	 * Log the machine checks left over from the previous reset.
 	 */
-	machine_check_poll(MCP_UC);
+	bitmap_fill(all_banks, MAX_NR_BANKS);
+	machine_check_poll(MCP_UC, &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 49705be..ee8bfcd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -231,7 +231,8 @@ static void amd_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			machine_check_poll(MCP_TIMESTAMP);
+			machine_check_poll(MCP_TIMESTAMP,
+					&__get_cpu_var(mce_poll_banks));
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
-- 
cgit v0.10.2


From 03195c6b40f2b4db92545921daa7c3a19b4e4c32 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:35 +0100
Subject: x86, mce, cmci: define MSR names and fields for new CMCI registers

Impact: New register definitions only

CMCI means support for raising an interrupt on a corrected machine
check event instead of having to poll for it. It's a new feature in
Intel Nehalem CPUs available on some machine check banks.

For details see the IA32 SDM Vol3a 14.5

Define the registers for it as a preparation for further patches.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e3..bc9514f 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
 #define		APIC_ESR_SENDILL	0x00020
 #define		APIC_ESR_RECVILL	0x00040
 #define		APIC_ESR_ILLREGA	0x00080
+#define 	APIC_LVTCMCI	0x2f0
 #define	APIC_ICR	0x300
 #define		APIC_DEST_SELF		0x40000
 #define		APIC_DEST_ALLINC	0x80000
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 9b952369..6fc5e07 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
  */
 
 #define MCG_CTL_P	 (1UL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P	 (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P	 (1ULL<<10)  /* CMCI supported */
 
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc5..2dbd231 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
 #define MSR_IA32_MC0_ADDR		0x00000402
 #define MSR_IA32_MC0_MISC		0x00000403
 
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2		0x00000280
+#define CMCI_EN			(1ULL << 30)
+#define CMCI_THRESHOLD_MASK		0xffffULL
+
 #define MSR_P6_PERFCTR0			0x000000c1
 #define MSR_P6_PERFCTR1			0x000000c2
 #define MSR_P6_EVNTSEL0			0x00000186
-- 
cgit v0.10.2


From 88ccbedd9ca85d1aca6a6f99df48dce87b7c02d4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:36 +0100
Subject: x86, mce, cmci: add CMCI support

Impact: Major new feature

Intel CMCI (Corrected Machine Check Interrupt) is a new
feature on Nehalem CPUs. It allows the CPU to trigger
interrupts on corrected events, which allows faster
reaction to them instead of with the traditional
polling timer.

Also use CMCI to discover shared banks. Machine check banks
can be shared by CPU threads or even cores. Using the CMCI enable
bit it is possible to detect the fact that another CPU already
saw a specific bank. Use this to assign shared banks only
to one CPU to avoid reporting duplicated events.

On CPU hot unplug bank sharing is re discovered. This is done
using a thread that cycles through all the CPUs.

To avoid races between the poller and CMCI we only poll
for banks that are not CMCI capable and only check CMCI
owned banks on a interrupt.

The shared banks ownership information is currently only used for
CMCI interrupts, not polled banks.

The sharing discovery code follows the algorithm recommended in the
IA32 SDM Vol3a 14.5.2.1

The CMCI interrupt handler just calls the machine check poller to
pick up the machine check event that caused the interrupt.

I decided not to implement a separate threshold event like
the AMD version has, because the threshold is always one currently
and adding another event didn't seem to add any value.

Some code inspired by Yunhong Jiang's Xen implementation,
which was in term inspired by a earlier CMCI implementation
by me.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6fc5e07..563933e 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -105,8 +105,16 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
 #endif
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -115,6 +123,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
+extern int mce_available(struct cpuinfo_x86 *c);
+
 void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index a8ff38b..bfbd532 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 	panic(msg);
 }
 
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_dont_init)
 		return 0;
@@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 static void mce_disable_cpu(void *h)
 {
 	int i;
+	unsigned long action = *(unsigned long *)h;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_clear();
 	for (i = 0; i < banks; i++)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 }
@@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h)
 static void mce_reenable_cpu(void *h)
 {
 	int i;
+	unsigned long action = *(unsigned long *)h;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_reenable();
 	for (i = 0; i < banks; i++)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
 }
@@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		del_timer_sync(t);
-		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
+		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		t->expires = round_jiffies_relative(jiffies + next_interval);
 		add_timer_on(t, cpu);
-		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
+		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+		break;
+	case CPU_POST_DEAD:
+		/* intentionally ignoring frozen here */
+		cmci_rediscover(cpu);
 		break;
 	}
 	return NOTIFY_OK;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 1b1491a..a518ec8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
 /*
  * Intel specific MCE features.
  * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -12,6 +14,7 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
+#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
@@ -84,7 +87,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	return;
 }
 
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static __cpuinit int cmci_supported(int *banks)
+{
+	u64 cap;
+
+	/*
+	 * Vendor check is not strictly needed, but the initial
+	 * initialization is vendor keyed and this
+	 * makes sure none of the backdoors are entered otherwise.
+	 */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+	if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+		return 0;
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+	return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+	if (*hdr == 0)
+		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+	*hdr = 1;
+	printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static __cpuinit void cmci_discover(int banks, int boot)
+{
+	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+	int hdr = 0;
+	int i;
+
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		u64 val;
+
+		if (test_bit(i, owned))
+			continue;
+
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Already owned by someone else? */
+		if (val & CMCI_EN) {
+			if (test_and_clear_bit(i, owned) || boot)
+				print_update("SHD", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+			continue;
+		}
+
+		val |= CMCI_EN | CMCI_THRESHOLD;
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Did the enable bit stick? -- the bank supports CMCI */
+		if (val & CMCI_EN) {
+			if (!test_and_set_bit(i, owned) || boot)
+				print_update("CMCI", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+		} else {
+			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+		}
+	}
+	spin_unlock(&cmci_discover_lock);
+	if (hdr)
+		printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+__cpuinit void cmci_recheck(void)
+{
+	unsigned long flags;
+	int banks;
+
+	if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+		return;
+	local_irq_save(flags);
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void __cpuexit cmci_clear(void)
+{
+	int i;
+	int banks;
+	u64 val;
+
+	if (!cmci_supported(&banks))
+		return;
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+			continue;
+		/* Disable CMCI */
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		__clear_bit(i, __get_cpu_var(mce_banks_owned));
+	}
+	spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void __cpuexit cmci_rediscover(int dying)
+{
+	int banks;
+	int cpu;
+	cpumask_var_t old;
+
+	if (!cmci_supported(&banks))
+		return;
+	if (!alloc_cpumask_var(&old, GFP_KERNEL))
+		return;
+	cpumask_copy(old, &current->cpus_allowed);
+
+	for_each_online_cpu (cpu) {
+		if (cpu == dying)
+			continue;
+		if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+			continue;
+		/* Recheck banks in case CPUs don't all have the same */
+		if (cmci_supported(&banks))
+			cmci_discover(banks, 0);
+	}
+
+	set_cpus_allowed_ptr(current, old);
+	free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+	int banks;
+	if (cmci_supported(&banks))
+		cmci_discover(banks, 0);
+}
+
+static __cpuinit void intel_init_cmci(void)
+{
+	int banks;
+
+	if (!cmci_supported(&banks))
+		return;
+
+	mce_threshold_vector = intel_threshold_interrupt;
+	cmci_discover(banks, 1);
+	/*
+	 * For CPU #0 this runs with still disabled APIC, but that's
+	 * ok because only the vector is set up. We still do another
+	 * check for the banks later for CPU #0 just to make sure
+	 * to not miss any events.
+	 */
+	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+	cmci_recheck();
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
+	intel_init_cmci();
 }
-- 
cgit v0.10.2


From df20e2eb3e59b8625021a1bc8b1b53a4edc6008b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 24 Feb 2009 13:19:02 -0800
Subject: x86, mce, cmci: remove incorrect __cpuinit/__cpuexit annotations

Impact: Bug fix on UP

The MCE code is reinitialized from resume, so we can't use
__cpuinit/__cpuexit for most of the code.  Remove those annotations
for anything downstream of mce_init().

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index a518ec8..7a2e10f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -104,7 +104,7 @@ static DEFINE_SPINLOCK(cmci_discover_lock);
 
 #define CMCI_THRESHOLD 1
 
-static __cpuinit int cmci_supported(int *banks)
+static int cmci_supported(int *banks)
 {
 	u64 cap;
 
@@ -147,7 +147,7 @@ static void print_update(char *type, int *hdr, int num)
  * on this CPU. Use the algorithm recommended in the SDM to discover shared
  * banks.
  */
-static __cpuinit void cmci_discover(int banks, int boot)
+static void cmci_discover(int banks, int boot)
 {
 	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
 	int hdr = 0;
@@ -192,7 +192,7 @@ static __cpuinit void cmci_discover(int banks, int boot)
  * Just in case we missed an event during initialization check
  * all the CMCI owned banks.
  */
-__cpuinit void cmci_recheck(void)
+void cmci_recheck(void)
 {
 	unsigned long flags;
 	int banks;
@@ -208,7 +208,7 @@ __cpuinit void cmci_recheck(void)
  * Disable CMCI on this CPU for all banks it owns when it goes down.
  * This allows other CPUs to claim the banks on rediscovery.
  */
-void __cpuexit cmci_clear(void)
+void cmci_clear(void)
 {
 	int i;
 	int banks;
@@ -233,7 +233,7 @@ void __cpuexit cmci_clear(void)
  * After a CPU went down cycle through all the others and rediscover
  * Must run in process context.
  */
-void __cpuexit cmci_rediscover(int dying)
+void cmci_rediscover(int dying)
 {
 	int banks;
 	int cpu;
-- 
cgit v0.10.2


From 5ca8681ca10f671427710f4954644359856581a3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:37 +0100
Subject: x86, mce, cmci: disable CMCI on rebooting

Impact: Avoids confusing other OSes.

Disable the CMCI vector on reboot to avoid confusing other OS.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 570f36e..648676f 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -868,6 +868,14 @@ void clear_local_APIC(void)
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
 	}
 #endif
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 6) {
+		v = apic_read(APIC_LVTCMCI);
+		if (!(v & APIC_LVT_MASKED))
+			apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+	}
+#endif
+
 	/*
 	 * Clean APIC state for other OSs:
 	 */
-- 
cgit v0.10.2


From be71b8553d0522aba535a815baaebb1f0bb9a9ec Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:38 +0100
Subject: x86, mce, cmci: recheck CMCI banks after APIC has been enabled on CPU
 #0

Impact: Fix marginal race condition

One the first CPU the machine checks are enabled early before
the local APIC is enabled. This could in theory lead
to some lost CMCI events very early during boot because
CMCIs cannot be delivered with disabled LAPIC.

The poller also doesn't recover from this because it doesn't
check CMCI banks.

Add an explicit CMCI banks check after the LAPIC is enabled.
This is only done for CPU #0, the other CPUs only initialize
machine checks after the LAPIC is on.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 648676f..57b5377 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -48,6 +48,7 @@
 #include <asm/apic.h>
 #include <asm/i8259.h>
 #include <asm/smp.h>
+#include <asm/mce.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -1270,6 +1271,12 @@ void __cpuinit setup_local_APIC(void)
 	apic_write(APIC_LVT1, value);
 
 	preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+	/* Recheck CMCI information after local APIC is up on CPU #0 */
+	if (smp_processor_id() == 0)
+		cmci_recheck();
+#endif
 }
 
 void __cpuinit end_local_APIC_setup(void)
-- 
cgit v0.10.2


From 24ff954233ecfd45801383f831626f88937ebe6f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 25 Feb 2009 10:38:10 +0900
Subject: x86, percpu: fix minor bugs in setup_percpu.c

Recent changes in setup_percpu.c made a now meaningless DBG()
statement fail to compile and introduced a
comparison-of-different-types warning.  Fix them.

Compile failure is reported by Ingo Molnar.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2d946a8..c29f301 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -270,7 +270,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 
 	/* allocate and copy */
 	pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
-	pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
+	pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
 	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
 				       PAGE_SIZE);
 	if (!pcpue_ptr)
@@ -438,8 +438,6 @@ void __init setup_per_cpu_areas(void)
 		 */
 		if (cpu == boot_cpu_id)
 			switch_to_new_gdt(cpu);
-
-		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}
 
 	/* indicate the early static arrays will soon be gone */
-- 
cgit v0.10.2


From d325100504f1d0c296a1fbfef558deaa655e2240 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 25 Feb 2009 11:01:40 +0900
Subject: x86: convert cacheflush macros inline functions

Impact: cleanup

Unused macro parameters cause spurious unused variable warnings.
Convert all cacheflush macros to inline functions to avoid the
warnings and achieve better type checking.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f84665..5b301b7 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
 #include <linux/mm.h>
 
 /* Caches aren't brain-dead on the intel. */
-#define flush_cache_all()			do { } while (0)
-#define flush_cache_mm(mm)			do { } while (0)
-#define flush_cache_dup_mm(mm)			do { } while (0)
-#define flush_cache_range(vma, start, end)	do { } while (0)
-#define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
-#define flush_dcache_page(page)			do { } while (0)
-#define flush_dcache_mmap_lock(mapping)		do { } while (0)
-#define flush_dcache_mmap_unlock(mapping)	do { } while (0)
-#define flush_icache_range(start, end)		do { } while (0)
-#define flush_icache_page(vma, pg)		do { } while (0)
-#define flush_icache_user_range(vma, pg, adr, len)	do { } while (0)
-#define flush_cache_vmap(start, end)		do { } while (0)
-#define flush_cache_vunmap(start, end)		do { } while (0)
+static inline void flush_cache_all(void) { }
+static inline void flush_cache_mm(struct mm_struct *mm) { }
+static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
+static inline void flush_cache_range(struct vm_area_struct *vma,
+				     unsigned long start, unsigned long end) { }
+static inline void flush_cache_page(struct vm_area_struct *vma,
+				    unsigned long vmaddr, unsigned long pfn) { }
+static inline void flush_dcache_page(struct page *page) { }
+static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
+static inline void flush_icache_range(unsigned long start,
+				      unsigned long end) { }
+static inline void flush_icache_page(struct vm_area_struct *vma,
+				     struct page *page) { }
+static inline void flush_icache_user_range(struct vm_area_struct *vma,
+					   struct page *page,
+					   unsigned long addr,
+					   unsigned long len) { }
+static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
+static inline void flush_cache_vunmap(unsigned long start,
+				      unsigned long end) { }
 
-#define copy_to_user_page(vma, page, vaddr, dst, src, len)	\
-	memcpy((dst), (src), (len))
-#define copy_from_user_page(vma, page, vaddr, dst, src, len)	\
-	memcpy((dst), (src), (len))
+static inline void copy_to_user_page(struct vm_area_struct *vma,
+				     struct page *page, unsigned long vaddr,
+				     void *dst, const void *src,
+				     unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static inline void copy_from_user_page(struct vm_area_struct *vma,
+				       struct page *page, unsigned long vaddr,
+				       void *dst, const void *src,
+				       unsigned long len)
+{
+	memcpy(dst, src, len);
+}
 
 #define PG_non_WB				PG_arch_1
 PAGEFLAG(NonWB, non_WB)
-- 
cgit v0.10.2


From 0dcec8c27ba44cd11c6e68c46d5fd553818a3837 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 14:07:33 +0100
Subject: alloc_percpu: add align argument to __alloc_percpu, fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: build fix

API was changed, but not all usage sites were converted:

 net/ipv4/route.c: In function ‘ip_rt_init’:
 net/ipv4/route.c:3379: error: too few arguments to function ‘__alloc_percpu’

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 97f7115..bf89540 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3376,7 +3376,7 @@ int __init ip_rt_init(void)
 	int rc = 0;
 
 #ifdef CONFIG_NET_CLS_ROUTE
-	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
+	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
 	if (!ip_rt_acct)
 		panic("IP: failed to allocate ip_rt_acct\n");
 #endif
-- 
cgit v0.10.2


From d2b0261506602bd969164879206027b30358ffdf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 14:36:45 +0100
Subject: alloc_percpu: fix UP build

Impact: build fix

the !SMP branch had a 'gfp' leftover:

 include/linux/percpu.h: In function '__alloc_percpu':
 include/linux/percpu.h:160: error: 'gfp' undeclared (first use in this function)
 include/linux/percpu.h:160: error: (Each undeclared identifier is reported only once
 include/linux/percpu.h:160: error: for each function it appears in.)

Use GFP_KERNEL like the SMP version does.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 910beb0..d8e5a9a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -157,7 +157,7 @@ static inline void *__alloc_percpu(size_t size, size_t align)
 	 * percpu sections on SMP for which this path isn't used.
 	 */
 	WARN_ON_ONCE(align > __alignof__(unsigned long long));
-	return kzalloc(size, gfp);
+	return kzalloc(size, GFP_KERNEL);
 }
 
 static inline void free_percpu(void *p)
-- 
cgit v0.10.2


From e317603694bfd17b28a40de9d65e1a4ec12f816e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 26 Feb 2009 10:54:17 +0900
Subject: percpu: fix too low alignment restriction on UP

UP __alloc_percpu() triggered WARN_ON_ONCE() if the requested
alignment is larger than that of unsigned long long, which is too
small for all the cacheline aligned allocations.  Bump it up to
SMP_CACHE_BYTES which kmalloc allocations generally guarantee.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index d8e5a9a..545b068 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -156,7 +156,7 @@ static inline void *__alloc_percpu(size_t size, size_t align)
 	 * on it.  Larger alignment should only be used for module
 	 * percpu sections on SMP for which this path isn't used.
 	 */
-	WARN_ON_ONCE(align > __alignof__(unsigned long long));
+	WARN_ON_ONCE(align > SMP_CACHE_BYTES);
 	return kzalloc(size, GFP_KERNEL);
 }
 
-- 
cgit v0.10.2


From a682604838763981613e42015cd0e39f2989d6bb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 25 Feb 2009 18:03:42 -0800
Subject: rcu: Teach RCU that idle task is not quiscent state at boot

This patch fixes a bug located by Vegard Nossum with the aid of
kmemcheck, updated based on review comments from Nick Piggin,
Ingo Molnar, and Andrew Morton.  And cleans up the variable-name
and function-name language.  ;-)

The boot CPU runs in the context of its idle thread during boot-up.
During this time, idle_cpu(0) will always return nonzero, which will
fool Classic and Hierarchical RCU into deciding that a large chunk of
the boot-up sequence is a big long quiescent state.  This in turn causes
RCU to prematurely end grace periods during this time.

This patch changes the rcutree.c and rcuclassic.c rcu_check_callbacks()
function to ignore the idle task as a quiescent state until the
system has started up the scheduler in rest_init(), introducing a
new non-API function rcu_idle_now_means_idle() to inform RCU of this
transition.  RCU maintains an internal rcu_idle_cpu_truthful variable
to track this state, which is then used by rcu_check_callback() to
determine if it should believe idle_cpu().

Because this patch has the effect of disallowing RCU grace periods
during long stretches of the boot-up sequence, this patch also introduces
Josh Triplett's UP-only optimization that makes synchronize_rcu() be a
no-op if num_online_cpus() returns 1.  This allows boot-time code that
calls synchronize_rcu() to proceed normally.  Note, however, that RCU
callbacks registered by call_rcu() will likely queue up until later in
the boot sequence.  Although rcuclassic and rcutree can also use this
same optimization after boot completes, rcupreempt must restrict its
use of this optimization to the portion of the boot sequence before the
scheduler starts up, given that an rcupreempt RCU read-side critical
section may be preeempted.

In addition, this patch takes Nick Piggin's suggestion to make the
system_state global variable be __read_mostly.

Changes since v4:

o	Changes the name of the introduced function and variable to
	be less emotional.  ;-)

Changes since v3:

o	WARN_ON(nr_context_switches() > 0) to verify that RCU
	switches out of boot-time mode before the first context
	switch, as suggested by Nick Piggin.

Changes since v2:

o	Created rcu_blocking_is_gp() internal-to-RCU API that
	determines whether a call to synchronize_rcu() is itself
	a grace period.

o	The definition of rcu_blocking_is_gp() for rcuclassic and
	rcutree checks to see if but a single CPU is online.

o	The definition of rcu_blocking_is_gp() for rcupreempt
	checks to see both if but a single CPU is online and if
	the system is still in early boot.

	This allows rcupreempt to again work correctly if running
	on a single CPU after booting is complete.

o	Added check to rcupreempt's synchronize_sched() for there
	being but one online CPU.

Tested all three variants both SMP and !SMP, booted fine, passed a short
rcutorture test on both x86 and Power.

Located-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index f3f697d..80044a4 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void);
 #define rcu_enter_nohz()	do { } while (0)
 #define rcu_exit_nohz()		do { } while (0)
 
+/* A context switch is a grace period for rcuclassic. */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1;
+}
+
 #endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 921340a..528343e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,6 +52,9 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
+/* Internal to kernel, but needed by rcupreempt.h. */
+extern int rcu_scheduler_active;
+
 #if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
 #elif defined(CONFIG_TREE_RCU)
@@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void);
 
 /* Internal to kernel */
 extern void rcu_init(void);
+extern void rcu_scheduler_starting(void);
 extern int rcu_needs_cpu(int cpu);
 
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 3e05c09..74304b4 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void)
 #define rcu_exit_nohz()		do { } while (0)
 #endif /* CONFIG_NO_HZ */
 
+/*
+ * A context switch is a grace period for rcupreempt synchronize_rcu()
+ * only during early boot, before the scheduler has been initialized.
+ * So, how the heck do we get a context switch?  Well, if the caller
+ * invokes synchronize_rcu(), they are willing to accept a context
+ * switch, so we simply pretend that one happened.
+ *
+ * After boot, there might be a blocked or preempted task in an RCU
+ * read-side critical section, so we cannot then take the fastpath.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1 && !rcu_scheduler_active;
+}
+
 #endif /* __LINUX_RCUPREEMPT_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d4368b7..a722fb6 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void)
 }
 #endif /* CONFIG_NO_HZ */
 
+/* A context switch is a grace period for rcutree. */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1;
+}
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/init/main.c b/init/main.c
index 8442094..83697e1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -97,7 +97,7 @@ static inline void mark_rodata_ro(void) { }
 extern void tc_init(void);
 #endif
 
-enum system_states system_state;
+enum system_states system_state __read_mostly;
 EXPORT_SYMBOL(system_state);
 
 /*
@@ -463,6 +463,7 @@ static noinline void __init_refok rest_init(void)
 	 * at least once to get things moving:
 	 */
 	init_idle_bootup_task(current);
+	rcu_scheduler_starting();
 	preempt_enable_no_resched();
 	schedule();
 	preempt_disable();
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index bd5a900..654c640 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu)
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+	    (idle_cpu(cpu) && rcu_scheduler_active &&
+	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 
 		/*
 		 * Get here if this CPU took its interrupt from user
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a..cae8a05 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
+#include <linux/kernel_stat.h>
 
 enum rcu_barrier {
 	RCU_BARRIER_STD,
@@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
+int rcu_scheduler_active __read_mostly;
 
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head  *head)
 void synchronize_rcu(void)
 {
 	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu(&rcu.head, wakeme_after_rcu);
@@ -175,3 +181,9 @@ void __init rcu_init(void)
 	__rcu_init();
 }
 
+void rcu_scheduler_starting(void)
+{
+	WARN_ON(num_online_cpus() != 1);
+	WARN_ON(nr_context_switches() > 0);
+	rcu_scheduler_active = 1;
+}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 33cfc50..5d59e85 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1181,6 +1181,9 @@ void __synchronize_sched(void)
 {
 	struct rcu_synchronize rcu;
 
+	if (num_online_cpus() == 1)
+		return;  /* blocking is gp if only one CPU! */
+
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu_sched(&rcu.head, wakeme_after_rcu);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b2fd602..97ce315 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+	    (idle_cpu(cpu) && rcu_scheduler_active &&
+	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 
 		/*
 		 * Get here if this CPU took its interrupt from user
-- 
cgit v0.10.2


From a760a6656e6f00bb0144a42a048cf0266646e22c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 26 Feb 2009 14:06:31 +0800
Subject: crypto: api - Fix module load deadlock with fallback algorithms

With the mandatory algorithm testing at registration, we have
now created a deadlock with algorithms requiring fallbacks.
This can happen if the module containing the algorithm requiring
fallback is loaded first, without the fallback module being loaded
first.  The system will then try to test the new algorithm, find
that it needs to load a fallback, and then try to load that.

As both algorithms share the same module alias, it can attempt
to load the original algorithm again and block indefinitely.

As algorithms requiring fallbacks are a special case, we can fix
this by giving them a different module alias than the rest.  Then
it's just a matter of using the right aliases according to what
algorithms we're trying to find.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index c42cd89..6118890 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void)
 module_init(aes_s390_init);
 module_exit(aes_s390_fini);
 
-MODULE_ALIAS("aes");
+MODULE_ALIAS("aes-all");
 
 MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
 MODULE_LICENSE("GPL");
diff --git a/crypto/api.c b/crypto/api.c
index efe77df..38a2bc0 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -215,8 +215,19 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask)
 	mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD);
 	type &= mask;
 
-	alg = try_then_request_module(crypto_alg_lookup(name, type, mask),
-				      name);
+	alg = crypto_alg_lookup(name, type, mask);
+	if (!alg) {
+		char tmp[CRYPTO_MAX_ALG_NAME];
+
+		request_module(name);
+
+		if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) &&
+		    snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp))
+			request_module(tmp);
+
+		alg = crypto_alg_lookup(name, type, mask);
+	}
+
 	if (alg)
 		return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg;
 
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 856b3cc..3f0fdd1 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Michal Ludvig");
 
-MODULE_ALIAS("aes");
+MODULE_ALIAS("aes-all");
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index a7fbade..a2c8e85 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support.");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Michal Ludvig");
 
-MODULE_ALIAS("sha1");
-MODULE_ALIAS("sha256");
+MODULE_ALIAS("sha1-all");
+MODULE_ALIAS("sha256-all");
 MODULE_ALIAS("sha1-padlock");
 MODULE_ALIAS("sha256-padlock");
-- 
cgit v0.10.2


From cac64d00c256e65776d575e82aaf540632b66178 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 25 Feb 2009 09:59:26 -0800
Subject: sched_rt: don't start timer when rt bandwidth disabled

Impact: fix incorrect condition check

No need to start rt bandwidth timer when rt bandwidth is disabled.
If this timer starts, it may stop at sched_rt_period_timer() on the first time.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 410eec4..c3baa96 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
-- 
cgit v0.10.2


From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Fri, 27 Feb 2009 15:13:54 +0530
Subject: sched: don't allow setuid to succeed if the user does not have rt
 bandwidth

Impact: fix hung task with certain (non-default) rt-limit settings

Corey Hickey reported that on using setuid to change the uid of a
rt process, the process would be unkillable and not be running.
This is because there was no rt runtime for that user group. Add
in a check to see if a user can attach an rt task to its task group.
On failure, return EINVAL, which is also returned in
CONFIG_CGROUP_SCHED.

Reported-by: Corey Hickey <bugfood-ml@fatooh.org>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8981e52..8c216e0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg);
 extern int sched_group_set_rt_period(struct task_group *tg,
 				      long rt_period_us);
 extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
 #endif
 
+extern int task_can_switch_user(struct user_struct *up,
+					struct task_struct *tsk);
+
 #ifdef CONFIG_TASK_XACCT
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index c3baa96..8e2558c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void)
 
 	return ret;
 }
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+	/* Don't accept realtime tasks when there is no way for them to run */
+	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+		return 0;
+
+	return 1;
+}
+
 #else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
@@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		      struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-	/* Don't accept realtime tasks when there is no way for them to run */
-	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
 		return -EINVAL;
 #else
 	/* We don't support RT-tasks being in separate groups */
diff --git a/kernel/sys.c b/kernel/sys.c
index f145c41..37f458e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -559,7 +559,7 @@ error:
 	abort_creds(new);
 	return retval;
 }
-  
+
 /*
  * change the user struct in a credentials set to match the new UID
  */
@@ -571,6 +571,11 @@ static int set_user(struct cred *new)
 	if (!new_user)
 		return -EAGAIN;
 
+	if (!task_can_switch_user(new_user, current)) {
+		free_uid(new_user);
+		return -EINVAL;
+	}
+
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
 			new_user != INIT_USER) {
@@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 			goto error;
 	}
 
-	retval = -EAGAIN;
-	if (new->uid != old->uid && set_user(new) < 0)
-		goto error;
-
+	if (new->uid != old->uid) {
+		retval = set_user(new);
+		if (retval < 0)
+			goto error;
+	}
 	if (ruid != (uid_t) -1 ||
 	    (euid != (uid_t) -1 && euid != old->uid))
 		new->suid = new->euid;
@@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 	retval = -EPERM;
 	if (capable(CAP_SETUID)) {
 		new->suid = new->uid = uid;
-		if (uid != old->uid && set_user(new) < 0) {
-			retval = -EAGAIN;
-			goto error;
+		if (uid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
 		}
 	} else if (uid != old->uid && uid != new->suid) {
 		goto error;
@@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 			goto error;
 	}
 
-	retval = -EAGAIN;
 	if (ruid != (uid_t) -1) {
 		new->uid = ruid;
-		if (ruid != old->uid && set_user(new) < 0)
-			goto error;
+		if (ruid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
+		}
 	}
 	if (euid != (uid_t) -1)
 		new->euid = euid;
diff --git a/kernel/user.c b/kernel/user.c
index 3551ac7..6a9b696 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags)
 
 #endif
 
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
+/*
+ * We need to check if a setuid can take place. This function should be called
+ * before successfully completing the setuid.
+ */
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+
+	return sched_rt_can_attach(up->tg, tsk);
+
+}
+#else
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+	return 1;
+}
+#endif
+
 /*
  * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
  * caller must undo that ref with free_uid().
-- 
cgit v0.10.2


From 02d51fdfb2bfcf6bbd776f983177f55868aa0a79 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Mar 2009 15:42:36 +0900
Subject: percpu: kill compile warning in pcpu_populate_chunk()

Impact: remove compile warning

Mark local variable map_end in pcpu_populate_chunk() with
uninitialized_var().  The variable is always used in tandem with
map_start and guaranteed to be initialized before use but gcc doesn't
understand that.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>

diff --git a/mm/percpu.c b/mm/percpu.c
index 5954e7a..3d0f545 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -639,7 +639,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
 	int map_start = -1;
-	int map_end;
+	int uninitialized_var(map_end);
 	unsigned int cpu;
 	int i;
 
-- 
cgit v0.10.2


From af6326d72c95d6e0bbc88c92185c654f57acef3b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Mar 2009 16:03:16 +0900
Subject: alpha: fix typo in recent early vmalloc change

Impact: fix build

Add missing 'o' in variable name.  Compile tested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 91eddd8..af71d38 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -202,7 +202,7 @@ callback_init(void * kernel_end)
 		console_remap_vm.size = nr_pages << PAGE_SHIFT;
 		vm_area_register_early(&console_remap_vm, PAGE_SIZE);
 
-		vaddr = (unsigned long)consle_remap_vm.addr;
+		vaddr = (unsigned long)console_remap_vm.addr;
 
 		/* Set up the third level PTEs and update the virtual
 		   addresses of the CRB entries.  */
-- 
cgit v0.10.2


From d0c4f570276cb4d2dc4215b90eb7cb6e2bdd4a15 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Mar 2009 16:06:56 +0900
Subject: bootmem, x86: further fixes for arch-specific bootmem wrapping

Impact: fix new breakages introduced by previous fix

Commit c132937556f56ee4b831ef4b23f1846e05fde102 tried to clean up
bootmem arch wrapper but it wasn't quite correct.  Before the commit,
the followings were broken.

* Low level interface functions prefixed with __ ignored arch
  preference.

* reserve_bootmem(...) can't be mapped into
  reserve_bootmem_node(NODE_DATA(0)->bdata, ...) because the node is
  not preference here.  The region specified MUST fall into the
  specified region; otherwise, it will panic.

After the commit,

* If allocation fails for the arch preferred node, it should fallback
  to whatever is available.  Instead, it simply failed allocation.

There are too many internal details to allow generic wrapping and
still keep things simple for archs.  Plus, all that arch wants is a
way to prefer certain node over another.

This patch drops the generic wrapping around alloc_bootmem_core() and
add alloc_bootmem_core() instead.  If necessary, arch can define
bootmem_arch_referred_node() macro or function which takes all
allocation information and returns the preferred node.  bootmem
generic code will always try the preferred node first and then
fallback to other nodes as usual.

Breakages noted and changes reviewed by Johannes Weiner.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index eeacf67..ede6998 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -92,12 +92,8 @@ static inline int pfn_valid(int pfn)
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
 /* always use node 0 for bootmem on this numa platform */
-#define alloc_bootmem_core(__bdata, size, align, goal, limit)		\
-({									\
-	bootmem_data_t __maybe_unused *	__abm_bdata_dummy = (__bdata);	\
-	__alloc_bootmem_core(NODE_DATA(0)->bdata,			\
-			     (size), (align), (goal), (limit));		\
-})
+#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit)	\
+	(NODE_DATA(0)->bdata)
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
 #endif /* _ASM_X86_MMZONE_32_H */
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d7140c0..daf9271 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -37,16 +37,6 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
 
 static int bootmem_debug;
 
-/*
- * If an arch needs to apply workarounds to bootmem allocation, it can
- * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
- * __alloc_bootmem_core().
- */
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM
-#define alloc_bootmem_core(bdata, size, align, goal, limit)		\
-	__alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
-#endif
-
 static int __init bootmem_debug_setup(char *buf)
 {
 	bootmem_debug = 1;
@@ -436,9 +426,9 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
 	return ALIGN(base + off, align) - base;
 }
 
-static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
-				unsigned long size, unsigned long align,
-				unsigned long goal, unsigned long limit)
+static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+					unsigned long size, unsigned long align,
+					unsigned long goal, unsigned long limit)
 {
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
@@ -538,17 +528,34 @@ find_block:
 	return NULL;
 }
 
+static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
+					unsigned long size, unsigned long align,
+					unsigned long goal, unsigned long limit)
+{
+#ifdef CONFIG_HAVE_ARCH_BOOTMEM
+	bootmem_data_t *p_bdata;
+
+	p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit);
+	if (p_bdata)
+		return alloc_bootmem_core(p_bdata, size, align, goal, limit);
+#endif
+	return NULL;
+}
+
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
 					unsigned long limit)
 {
 	bootmem_data_t *bdata;
+	void *region;
 
 restart:
-	list_for_each_entry(bdata, &bdata_list, list) {
-		void *region;
+	region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
+	if (region)
+		return region;
 
+	list_for_each_entry(bdata, &bdata_list, list) {
 		if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
 			continue;
 		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
@@ -626,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 {
 	void *ptr;
 
+	ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
+	if (ptr)
+		return ptr;
+
 	ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
 	if (ptr)
 		return ptr;
@@ -682,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
+	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
+	if (ptr)
+		return ptr;
+
 	ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return ptr;
-- 
cgit v0.10.2


From fab852aaf761a00cfe16330429b7cac15cceaeb9 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sun, 1 Mar 2009 16:09:14 +0200
Subject: x86: count errors in testmmiotrace.ko

Check the read values against the written values in the MMIO read/write
test. This test shows if the given MMIO test area really works as
memory, which is a prerequisite for a successful mmiotrace test.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Stuart Bennett <stuart@freedesktop.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index ab50a8d..4b29f3b 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,5 +1,5 @@
 /*
- * Written by Pekka Paalanen, 2008 <pq@iki.fi>
+ * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
  */
 #include <linux/module.h>
 #include <linux/io.h>
@@ -11,28 +11,51 @@ static unsigned long mmio_address;
 module_param(mmio_address, ulong, 0);
 MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
 
+static unsigned v16(unsigned i)
+{
+	return i * 12 + 7;
+}
+
+static unsigned v32(unsigned i)
+{
+	return i * 212371 + 13;
+}
+
 static void do_write_test(void __iomem *p)
 {
 	unsigned int i;
 	mmiotrace_printk("Write test.\n");
+
 	for (i = 0; i < 256; i++)
 		iowrite8(i, p + i);
+
 	for (i = 1024; i < (5 * 1024); i += 2)
-		iowrite16(i * 12 + 7, p + i);
+		iowrite16(v16(i), p + i);
+
 	for (i = (5 * 1024); i < (16 * 1024); i += 4)
-		iowrite32(i * 212371 + 13, p + i);
+		iowrite32(v32(i), p + i);
 }
 
 static void do_read_test(void __iomem *p)
 {
 	unsigned int i;
+	unsigned errs[3] = { 0 };
 	mmiotrace_printk("Read test.\n");
+
 	for (i = 0; i < 256; i++)
-		ioread8(p + i);
+		if (ioread8(p + i) != i)
+			++errs[0];
+
 	for (i = 1024; i < (5 * 1024); i += 2)
-		ioread16(p + i);
+		if (ioread16(p + i) != v16(i))
+			++errs[1];
+
 	for (i = (5 * 1024); i < (16 * 1024); i += 4)
-		ioread32(p + i);
+		if (ioread32(p + i) != v32(i))
+			++errs[2];
+
+	mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
+						errs[0], errs[1], errs[2]);
 }
 
 static void do_test(void)
-- 
cgit v0.10.2


From 5ff93697fcfe1e2b9e61db82961d8f50d1ad5d57 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sun, 1 Mar 2009 16:10:08 +0200
Subject: x86: add far read test to testmmiotrace

Apparently pages far into an ioremapped region might not actually be
mapped during ioremap(). Add an optional read test to try to trigger a
multiply faulting MMIO access. Also add more messages to the kernel log
to help debugging.

This patch is based on a patch suggested by
Stuart Bennett <stuart@freedesktop.org>
who discovered bugs in mmiotrace related to normal kernel space faults.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Stuart Bennett <stuart@freedesktop.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 4b29f3b..427fd1b 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -9,7 +9,13 @@
 
 static unsigned long mmio_address;
 module_param(mmio_address, ulong, 0);
-MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
+MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
+				"(or 8 MB if read_far is non-zero).");
+
+static unsigned long read_far = 0x400100;
+module_param(read_far, ulong, 0);
+MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
+				"(default: 0x400100).");
 
 static unsigned v16(unsigned i)
 {
@@ -24,6 +30,7 @@ static unsigned v32(unsigned i)
 static void do_write_test(void __iomem *p)
 {
 	unsigned int i;
+	pr_info(MODULE_NAME ": write test.\n");
 	mmiotrace_printk("Write test.\n");
 
 	for (i = 0; i < 256; i++)
@@ -40,6 +47,7 @@ static void do_read_test(void __iomem *p)
 {
 	unsigned int i;
 	unsigned errs[3] = { 0 };
+	pr_info(MODULE_NAME ": read test.\n");
 	mmiotrace_printk("Read test.\n");
 
 	for (i = 0; i < 256; i++)
@@ -58,9 +66,17 @@ static void do_read_test(void __iomem *p)
 						errs[0], errs[1], errs[2]);
 }
 
-static void do_test(void)
+static void do_read_far_test(void __iomem *p)
+{
+	pr_info(MODULE_NAME ": read far test.\n");
+	mmiotrace_printk("Read far test.\n");
+
+	ioread32(p + read_far);
+}
+
+static void do_test(unsigned long size)
 {
-	void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
+	void __iomem *p = ioremap_nocache(mmio_address, size);
 	if (!p) {
 		pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
 		return;
@@ -68,11 +84,15 @@ static void do_test(void)
 	mmiotrace_printk("ioremap returned %p.\n", p);
 	do_write_test(p);
 	do_read_test(p);
+	if (read_far && read_far < size - 4)
+		do_read_far_test(p);
 	iounmap(p);
 }
 
 static int __init init(void)
 {
+	unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+
 	if (mmio_address == 0) {
 		pr_err(MODULE_NAME ": you have to use the module argument "
 							"mmio_address.\n");
@@ -81,10 +101,11 @@ static int __init init(void)
 		return -ENXIO;
 	}
 
-	pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
-					"in PCI address space, and writing "
-					"rubbish in there.\n", mmio_address);
-	do_test();
+	pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
+		"address space, and writing 16 kB of rubbish in there.\n",
+		 size >> 10, mmio_address);
+	do_test(size);
+	pr_info(MODULE_NAME ": All done.\n");
 	return 0;
 }
 
-- 
cgit v0.10.2


From e9d54cae8f03e7f963a12f44bd50d68f49b9ea36 Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Fri, 30 Jan 2009 17:38:59 +0000
Subject: x86 mmiotrace: WARN_ONCE if dis/arming a page fails

Print a full warning once, if arming or disarming a page fails.

Also, if initial arming fails, do not handle the page further. This
avoids the possibility of a page failing to arm and then later claiming
to have handled any fault on that page.

WARN_ONCE added by Pekka Paalanen.

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 93d8203..fb1f115 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -105,7 +105,7 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 	return NULL;
 }
 
-static void set_page_present(unsigned long addr, bool present,
+static int set_page_present(unsigned long addr, bool present,
 							unsigned int *pglevel)
 {
 	pteval_t pteval;
@@ -116,7 +116,7 @@ static void set_page_present(unsigned long addr, bool present,
 
 	if (!pte) {
 		pr_err("kmmio: no pte for page 0x%08lx\n", addr);
-		return;
+		return -1;
 	}
 
 	if (pglevel)
@@ -140,22 +140,27 @@ static void set_page_present(unsigned long addr, bool present,
 
 	default:
 		pr_err("kmmio: unexpected page level 0x%x.\n", level);
-		return;
+		return -1;
 	}
 
 	__flush_tlb_one(addr);
+
+	return 0;
 }
 
 /** Mark the given page as not present. Access to it will trigger a fault. */
-static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+static int arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
 {
-	set_page_present(page & PAGE_MASK, false, pglevel);
+	int ret = set_page_present(page & PAGE_MASK, false, pglevel);
+	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", page);
+	return ret;
 }
 
 /** Mark the given page as present. */
 static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
 {
-	set_page_present(page & PAGE_MASK, true, pglevel);
+	int ret = set_page_present(page & PAGE_MASK, true, pglevel);
+	WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", page);
 }
 
 /*
@@ -326,9 +331,13 @@ static int add_kmmio_fault_page(unsigned long page)
 
 	f->count = 1;
 	f->page = page;
-	list_add_rcu(&f->list, kmmio_page_list(f->page));
 
-	arm_kmmio_fault_page(f->page, NULL);
+	if (arm_kmmio_fault_page(f->page, NULL)) {
+		kfree(f);
+		return -1;
+	}
+
+	list_add_rcu(&f->list, kmmio_page_list(f->page));
 
 	return 0;
 }
-- 
cgit v0.10.2


From 5359b585fb5edb3db34d6cd491e1475b098c61d3 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sun, 1 Mar 2009 16:11:58 +0200
Subject: x86 mmiotrace: fix save/restore page table state

From baa99e2b32449ec7bf147c234adfa444caecac8a Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sun, 22 Feb 2009 20:02:43 +0200

Blindly setting _PAGE_PRESENT in disarm_kmmio_fault_page() overlooks the
possibility, that the page was not present when it was armed.

Make arm_kmmio_fault_page() store the previous page presence in struct
kmmio_fault_page and use it on disarm.

This patch was originally written by Stuart Bennett, but Pekka Paalanen
rewrote it a little different.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Stuart Bennett <stuart@freedesktop.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index fb1f115..be361eb 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,6 +32,8 @@ struct kmmio_fault_page {
 	struct list_head list;
 	struct kmmio_fault_page *release_next;
 	unsigned long page; /* location of the fault page */
+	bool old_presence; /* page presence prior to arming */
+	bool armed;
 
 	/*
 	 * Number of times this page has been registered as a part
@@ -105,8 +107,7 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 	return NULL;
 }
 
-static int set_page_present(unsigned long addr, bool present,
-							unsigned int *pglevel)
+static int set_page_presence(unsigned long addr, bool present, bool *old)
 {
 	pteval_t pteval;
 	pmdval_t pmdval;
@@ -119,20 +120,21 @@ static int set_page_present(unsigned long addr, bool present,
 		return -1;
 	}
 
-	if (pglevel)
-		*pglevel = level;
-
 	switch (level) {
 	case PG_LEVEL_2M:
 		pmd = (pmd_t *)pte;
-		pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
+		pmdval = pmd_val(*pmd);
+		*old = !!(pmdval & _PAGE_PRESENT);
+		pmdval &= ~_PAGE_PRESENT;
 		if (present)
 			pmdval |= _PAGE_PRESENT;
 		set_pmd(pmd, __pmd(pmdval));
 		break;
 
 	case PG_LEVEL_4K:
-		pteval = pte_val(*pte) & ~_PAGE_PRESENT;
+		pteval = pte_val(*pte);
+		*old = !!(pteval & _PAGE_PRESENT);
+		pteval &= ~_PAGE_PRESENT;
 		if (present)
 			pteval |= _PAGE_PRESENT;
 		set_pte_atomic(pte, __pte(pteval));
@@ -148,19 +150,39 @@ static int set_page_present(unsigned long addr, bool present,
 	return 0;
 }
 
-/** Mark the given page as not present. Access to it will trigger a fault. */
-static int arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-	int ret = set_page_present(page & PAGE_MASK, false, pglevel);
-	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", page);
+	int ret;
+	WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
+	if (f->armed) {
+		pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
+					f->page, f->count, f->old_presence);
+	}
+	ret = set_page_presence(f->page, false, &f->old_presence);
+	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
+	f->armed = true;
 	return ret;
 }
 
-/** Mark the given page as present. */
-static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-	int ret = set_page_present(page & PAGE_MASK, true, pglevel);
-	WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", page);
+	bool tmp;
+	int ret = set_page_presence(f->page, f->old_presence, &tmp);
+	WARN_ONCE(ret < 0,
+			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+	f->armed = false;
 }
 
 /*
@@ -207,7 +229,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 
 	ctx = &get_cpu_var(kmmio_ctx);
 	if (ctx->active) {
-		disarm_kmmio_fault_page(faultpage->page, NULL);
+		disarm_kmmio_fault_page(faultpage);
 		if (addr == ctx->addr) {
 			/*
 			 * On SMP we sometimes get recursive probe hits on the
@@ -249,7 +271,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	regs->flags &= ~X86_EFLAGS_IF;
 
 	/* Now we set present bit in PTE and single step. */
-	disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+	disarm_kmmio_fault_page(ctx->fpage);
 
 	/*
 	 * If another cpu accesses the same page while we are stepping,
@@ -288,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	if (ctx->probe && ctx->probe->post_handler)
 		ctx->probe->post_handler(ctx->probe, condition, regs);
 
-	arm_kmmio_fault_page(ctx->fpage->page, NULL);
+	arm_kmmio_fault_page(ctx->fpage);
 
 	regs->flags &= ~X86_EFLAGS_TF;
 	regs->flags |= ctx->saved_flags;
@@ -320,19 +342,19 @@ static int add_kmmio_fault_page(unsigned long page)
 	f = get_kmmio_fault_page(page);
 	if (f) {
 		if (!f->count)
-			arm_kmmio_fault_page(f->page, NULL);
+			arm_kmmio_fault_page(f);
 		f->count++;
 		return 0;
 	}
 
-	f = kmalloc(sizeof(*f), GFP_ATOMIC);
+	f = kzalloc(sizeof(*f), GFP_ATOMIC);
 	if (!f)
 		return -1;
 
 	f->count = 1;
 	f->page = page;
 
-	if (arm_kmmio_fault_page(f->page, NULL)) {
+	if (arm_kmmio_fault_page(f)) {
 		kfree(f);
 		return -1;
 	}
@@ -356,7 +378,7 @@ static void release_kmmio_fault_page(unsigned long page,
 	f->count--;
 	BUG_ON(f->count < 0);
 	if (!f->count) {
-		disarm_kmmio_fault_page(f->page, NULL);
+		disarm_kmmio_fault_page(f);
 		f->release_next = *release_list;
 		*release_list = f;
 	}
-- 
cgit v0.10.2


From 0b700a6a253b6a3b3059bb9a9247a73490ee33fb Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sun, 1 Mar 2009 16:12:48 +0200
Subject: x86 mmiotrace: split set_page_presence()

From 36772dcb6ffbbb68254cbfc379a103acd2fbfefc Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sat, 28 Feb 2009 21:34:59 +0200

Split set_page_presence() in kmmio.c into two more functions set_pmd_presence()
and set_pte_presence(). Purely code reorganization, no functional changes.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Stuart Bennett <stuart@freedesktop.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index be361eb..d297775 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -107,12 +107,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 	return NULL;
 }
 
+static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+{
+	pmdval_t v = pmd_val(*pmd);
+	*old = !!(v & _PAGE_PRESENT);
+	v &= ~_PAGE_PRESENT;
+	if (present)
+		v |= _PAGE_PRESENT;
+	set_pmd(pmd, __pmd(v));
+}
+
+static void set_pte_presence(pte_t *pte, bool present, bool *old)
+{
+	pteval_t v = pte_val(*pte);
+	*old = !!(v & _PAGE_PRESENT);
+	v &= ~_PAGE_PRESENT;
+	if (present)
+		v |= _PAGE_PRESENT;
+	set_pte_atomic(pte, __pte(v));
+}
+
 static int set_page_presence(unsigned long addr, bool present, bool *old)
 {
-	pteval_t pteval;
-	pmdval_t pmdval;
 	unsigned int level;
-	pmd_t *pmd;
 	pte_t *pte = lookup_address(addr, &level);
 
 	if (!pte) {
@@ -122,31 +139,17 @@ static int set_page_presence(unsigned long addr, bool present, bool *old)
 
 	switch (level) {
 	case PG_LEVEL_2M:
-		pmd = (pmd_t *)pte;
-		pmdval = pmd_val(*pmd);
-		*old = !!(pmdval & _PAGE_PRESENT);
-		pmdval &= ~_PAGE_PRESENT;
-		if (present)
-			pmdval |= _PAGE_PRESENT;
-		set_pmd(pmd, __pmd(pmdval));
+		set_pmd_presence((pmd_t *)pte, present, old);
 		break;
-
 	case PG_LEVEL_4K:
-		pteval = pte_val(*pte);
-		*old = !!(pteval & _PAGE_PRESENT);
-		pteval &= ~_PAGE_PRESENT;
-		if (present)
-			pteval |= _PAGE_PRESENT;
-		set_pte_atomic(pte, __pte(pteval));
+		set_pte_presence(pte, present, old);
 		break;
-
 	default:
 		pr_err("kmmio: unexpected page level 0x%x.\n", level);
 		return -1;
 	}
 
 	__flush_tlb_one(addr);
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From 3e39aa156a24ce386da378784edd0f748c770087 Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Thu, 5 Feb 2009 11:02:02 +0000
Subject: x86 mmiotrace: improve handling of secondary faults

Upgrade some kmmio.c debug messages to warnings.
Allow secondary faults on probed pages to fall through, and only log
secondary faults that are not due to non-present pages.

Patch edited by Pekka Paalanen.

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index d297775..4c66bd3 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -232,28 +232,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 
 	ctx = &get_cpu_var(kmmio_ctx);
 	if (ctx->active) {
-		disarm_kmmio_fault_page(faultpage);
 		if (addr == ctx->addr) {
 			/*
-			 * On SMP we sometimes get recursive probe hits on the
-			 * same address. Context is already saved, fall out.
+			 * A second fault on the same page means some other
+			 * condition needs handling by do_page_fault(), the
+			 * page really not being present is the most common.
 			 */
-			pr_debug("kmmio: duplicate probe hit on CPU %d, for "
-						"address 0x%08lx.\n",
-						smp_processor_id(), addr);
-			ret = 1;
-			goto no_kmmio_ctx;
-		}
-		/*
-		 * Prevent overwriting already in-flight context.
-		 * This should not happen, let's hope disarming at least
-		 * prevents a panic.
-		 */
-		pr_emerg("kmmio: recursive probe hit on CPU %d, "
+			pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
+					addr, smp_processor_id());
+
+			if (!faultpage->old_presence)
+				pr_info("kmmio: unexpected secondary hit for "
+					"address 0x%08lx on CPU %d.\n", addr,
+					smp_processor_id());
+		} else {
+			/*
+			 * Prevent overwriting already in-flight context.
+			 * This should not happen, let's hope disarming at
+			 * least prevents a panic.
+			 */
+			pr_emerg("kmmio: recursive probe hit on CPU %d, "
 					"for address 0x%08lx. Ignoring.\n",
 					smp_processor_id(), addr);
-		pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
-					ctx->addr);
+			pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
+						ctx->addr);
+			disarm_kmmio_fault_page(faultpage);
+		}
 		goto no_kmmio_ctx;
 	}
 	ctx->active++;
@@ -305,7 +309,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
 	if (!ctx->active) {
-		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+		pr_warning("kmmio: spurious debug trap on CPU %d.\n",
 							smp_processor_id());
 		goto out;
 	}
-- 
cgit v0.10.2


From 340430c572f7b2b275d39965e88bafa71693cb23 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 24 Feb 2009 21:44:15 +0200
Subject: x86 mmiotrace: fix race with release_kmmio_fault_page()

There was a theoretical possibility to a race between arming a page in
post_kmmio_handler() and disarming the page in
release_kmmio_fault_page():

cpu0                             cpu1
------------------------------------------------------------------
mmiotrace shutdown
enter release_kmmio_fault_page
                                 fault on the page
                                 disarm the page
disarm the page
                                 handle the MMIO access
                                 re-arm the page
put the page on release list
remove_kmmio_fault_pages()
                                 fault on the page
                                 page not known to mmiotrace
                                 fall back to do_page_fault()
                                 *KABOOM*

(This scenario also shows the double disarm case which is allowed.)

Fixed by acquiring kmmio_lock in post_kmmio_handler() and checking
if the page is being released from mmiotrace.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Stuart Bennett <stuart@freedesktop.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 4c66bd3..9f20503 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -38,7 +38,8 @@ struct kmmio_fault_page {
 	/*
 	 * Number of times this page has been registered as a part
 	 * of a probe. If zero, page is disarmed and this may be freed.
-	 * Used only by writers (RCU).
+	 * Used only by writers (RCU) and post_kmmio_handler().
+	 * Protected by kmmio_lock, when linked into kmmio_page_table.
 	 */
 	int count;
 };
@@ -317,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	if (ctx->probe && ctx->probe->post_handler)
 		ctx->probe->post_handler(ctx->probe, condition, regs);
 
-	arm_kmmio_fault_page(ctx->fpage);
+	/* Prevent racing against release_kmmio_fault_page(). */
+	spin_lock(&kmmio_lock);
+	if (ctx->fpage->count)
+		arm_kmmio_fault_page(ctx->fpage);
+	spin_unlock(&kmmio_lock);
 
 	regs->flags &= ~X86_EFLAGS_TF;
 	regs->flags |= ctx->saved_flags;
-- 
cgit v0.10.2


From a572e217c6f09fb02853d3196458b8bf30a9a321 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Mon, 2 Mar 2009 17:22:36 +0800
Subject: Blackfin arch: drop untested and useless "generic" board file

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf533/boards/Kconfig b/arch/blackfin/mach-bf533/boards/Kconfig
index 308c98d..8d8b3e7 100644
--- a/arch/blackfin/mach-bf533/boards/Kconfig
+++ b/arch/blackfin/mach-bf533/boards/Kconfig
@@ -38,9 +38,4 @@ config BFIN532_IP0X
 	help
 	  Core support for IP04/IP04 open hardware IP-PBX.
 
-config GENERIC_BF533_BOARD
-	bool "Generic"
-	help
-	  Generic or Custom board support.
-
 endchoice
diff --git a/arch/blackfin/mach-bf533/boards/Makefile b/arch/blackfin/mach-bf533/boards/Makefile
index 9afbe72..ff1e832 100644
--- a/arch/blackfin/mach-bf533/boards/Makefile
+++ b/arch/blackfin/mach-bf533/boards/Makefile
@@ -2,7 +2,6 @@
 # arch/blackfin/mach-bf533/boards/Makefile
 #
 
-obj-$(CONFIG_GENERIC_BF533_BOARD)      += generic_board.o
 obj-$(CONFIG_BFIN533_STAMP)            += stamp.o
 obj-$(CONFIG_BFIN532_IP0X)             += ip0x.o
 obj-$(CONFIG_BFIN533_EZKIT)            += ezkit.o
diff --git a/arch/blackfin/mach-bf533/boards/generic_board.c b/arch/blackfin/mach-bf533/boards/generic_board.c
deleted file mode 100644
index 986eeec..0000000
--- a/arch/blackfin/mach-bf533/boards/generic_board.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * File:         arch/blackfin/mach-bf533/generic_board.c
- * Based on:     arch/blackfin/mach-bf533/ezkit.c
- * Author:       Aidan Williams <aidan@nicta.com.au>
- *
- * Created:      2005
- * Description:
- *
- * Modified:
- *               Copyright 2005 National ICT Australia (NICTA)
- *               Copyright 2004-2006 Analog Devices Inc.
- *
- * Bugs:         Enter bugs at http://blackfin.uclinux.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see the file COPYING, or write
- * to the Free Software Foundation, Inc.,
- * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/irq.h>
-
-/*
- * Name the Board for the /proc/cpuinfo
- */
-const char bfin_board_name[] = "UNKNOWN BOARD";
-
-#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE)
-static struct platform_device rtc_device = {
-	.name = "rtc-bfin",
-	.id   = -1,
-};
-#endif
-
-/*
- *  Driver needs to know address, irq and flag pin.
- */
-#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE)
-static struct resource smc91x_resources[] = {
-	{
-		.start = 0x20300300,
-		.end = 0x20300300 + 16,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = IRQ_PROG_INTB,
-		.end = IRQ_PROG_INTB,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	}, {
-		.start = IRQ_PF7,
-		.end = IRQ_PF7,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	},
-};
-
-static struct platform_device smc91x_device = {
-	.name = "smc91x",
-	.id = 0,
-	.num_resources = ARRAY_SIZE(smc91x_resources),
-	.resource = smc91x_resources,
-};
-#endif
-
-#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE)
-#ifdef CONFIG_BFIN_SIR0
-static struct resource bfin_sir0_resources[] = {
-	{
-		.start = 0xFFC00400,
-		.end = 0xFFC004FF,
-		.flags = IORESOURCE_MEM,
-	},
-	{
-		.start = IRQ_UART0_RX,
-		.end = IRQ_UART0_RX+1,
-		.flags = IORESOURCE_IRQ,
-	},
-	{
-		.start = CH_UART0_RX,
-		.end = CH_UART0_RX+1,
-		.flags = IORESOURCE_DMA,
-	},
-};
-
-static struct platform_device bfin_sir0_device = {
-	.name = "bfin_sir",
-	.id = 0,
-	.num_resources = ARRAY_SIZE(bfin_sir0_resources),
-	.resource = bfin_sir0_resources,
-};
-#endif
-#endif
-
-static struct platform_device *generic_board_devices[] __initdata = {
-#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE)
-	&rtc_device,
-#endif
-
-#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE)
-	&smc91x_device,
-#endif
-
-#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE)
-#ifdef CONFIG_BFIN_SIR0
-	&bfin_sir0_device,
-#endif
-#endif
-};
-
-static int __init generic_board_init(void)
-{
-	printk(KERN_INFO "%s(): registering device resources\n", __func__);
-	return platform_add_devices(generic_board_devices, ARRAY_SIZE(generic_board_devices));
-}
-
-arch_initcall(generic_board_init);
diff --git a/arch/blackfin/mach-bf537/boards/Kconfig b/arch/blackfin/mach-bf537/boards/Kconfig
index 42a57b0..77c59da 100644
--- a/arch/blackfin/mach-bf537/boards/Kconfig
+++ b/arch/blackfin/mach-bf537/boards/Kconfig
@@ -33,9 +33,4 @@ config CAMSIG_MINOTAUR
 	help
 	  Board supply package for CSP Minotaur
 
-config GENERIC_BF537_BOARD
-	bool "Generic"
-	help
-	  Generic or Custom board support.
-
 endchoice
diff --git a/arch/blackfin/mach-bf537/boards/Makefile b/arch/blackfin/mach-bf537/boards/Makefile
index 7168cc1..68b98a7a 100644
--- a/arch/blackfin/mach-bf537/boards/Makefile
+++ b/arch/blackfin/mach-bf537/boards/Makefile
@@ -2,7 +2,6 @@
 # arch/blackfin/mach-bf537/boards/Makefile
 #
 
-obj-$(CONFIG_GENERIC_BF537_BOARD)      += generic_board.o
 obj-$(CONFIG_BFIN537_STAMP)            += stamp.o
 obj-$(CONFIG_BFIN537_BLUETECHNIX_CM)   += cm_bf537.o
 obj-$(CONFIG_BFIN537_BLUETECHNIX_TCM)  += tcm_bf537.o
diff --git a/arch/blackfin/mach-bf537/boards/generic_board.c b/arch/blackfin/mach-bf537/boards/generic_board.c
deleted file mode 100644
index da710fd..0000000
--- a/arch/blackfin/mach-bf537/boards/generic_board.c
+++ /dev/null
@@ -1,745 +0,0 @@
-/*
- * File:         arch/blackfin/mach-bf537/boards/generic_board.c
- * Based on:     arch/blackfin/mach-bf533/boards/ezkit.c
- * Author:       Aidan Williams <aidan@nicta.com.au>
- *
- * Created:
- * Description:
- *
- * Modified:
- *               Copyright 2005 National ICT Australia (NICTA)
- *               Copyright 2004-2008 Analog Devices Inc.
- *
- * Bugs:         Enter bugs at http://blackfin.uclinux.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see the file COPYING, or write
- * to the Free Software Foundation, Inc.,
- * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/device.h>
-#include <linux/etherdevice.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/flash.h>
-#if defined(CONFIG_USB_ISP1362_HCD) || defined(CONFIG_USB_ISP1362_HCD_MODULE)
-#include <linux/usb/isp1362.h>
-#endif
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/usb/sl811.h>
-#include <asm/dma.h>
-#include <asm/bfin5xx_spi.h>
-#include <asm/reboot.h>
-#include <asm/portmux.h>
-#include <linux/spi/ad7877.h>
-
-/*
- * Name the Board for the /proc/cpuinfo
- */
-const char bfin_board_name[] = "UNKNOWN BOARD";
-
-/*
- *  Driver needs to know address, irq and flag pin.
- */
-
-#if defined(CONFIG_USB_ISP1760_HCD) || defined(CONFIG_USB_ISP1760_HCD_MODULE)
-#include <linux/usb/isp1760.h>
-static struct resource bfin_isp1760_resources[] = {
-	[0] = {
-		.start  = 0x203C0000,
-		.end    = 0x203C0000 + 0x000fffff,
-		.flags  = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start  = IRQ_PF7,
-		.end    = IRQ_PF7,
-		.flags  = IORESOURCE_IRQ,
-	},
-};
-
-static struct isp1760_platform_data isp1760_priv = {
-	.is_isp1761 = 0,
-	.port1_disable = 0,
-	.bus_width_16 = 1,
-	.port1_otg = 0,
-	.analog_oc = 0,
-	.dack_polarity_high = 0,
-	.dreq_polarity_high = 0,
-};
-
-static struct platform_device bfin_isp1760_device = {
-	.name           = "isp1760-hcd",
-	.id             = 0,
-	.dev = {
-		.platform_data = &isp1760_priv,
-	},
-	.num_resources  = ARRAY_SIZE(bfin_isp1760_resources),
-	.resource       = bfin_isp1760_resources,
-};
-#endif
-
-#if defined(CONFIG_BFIN_CFPCMCIA) || defined(CONFIG_BFIN_CFPCMCIA_MODULE)
-static struct resource bfin_pcmcia_cf_resources[] = {
-	{
-		.start = 0x20310000, /* IO PORT */
-		.end = 0x20312000,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = 0x20311000, /* Attribute Memory */
-		.end = 0x20311FFF,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = IRQ_PF4,
-		.end = IRQ_PF4,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	}, {
-		.start = 6, /* Card Detect PF6 */
-		.end = 6,
-		.flags = IORESOURCE_IRQ,
-	},
-};
-
-static struct platform_device bfin_pcmcia_cf_device = {
-	.name = "bfin_cf_pcmcia",
-	.id = -1,
-	.num_resources = ARRAY_SIZE(bfin_pcmcia_cf_resources),
-	.resource = bfin_pcmcia_cf_resources,
-};
-#endif
-
-#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE)
-static struct platform_device rtc_device = {
-	.name = "rtc-bfin",
-	.id   = -1,
-};
-#endif
-
-#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE)
-static struct resource smc91x_resources[] = {
-	{
-		.name = "smc91x-regs",
-		.start = 0x20300300,
-		.end = 0x20300300 + 16,
-		.flags = IORESOURCE_MEM,
-	}, {
-
-		.start = IRQ_PF7,
-		.end = IRQ_PF7,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	},
-};
-static struct platform_device smc91x_device = {
-	.name = "smc91x",
-	.id = 0,
-	.num_resources = ARRAY_SIZE(smc91x_resources),
-	.resource = smc91x_resources,
-};
-#endif
-
-#if defined(CONFIG_DM9000) || defined(CONFIG_DM9000_MODULE)
-static struct resource dm9000_resources[] = {
-	[0] = {
-		.start	= 0x203FB800,
-		.end	= 0x203FB800 + 1,
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= 0x203FB800 + 4,
-		.end	= 0x203FB800 + 5,
-		.flags	= IORESOURCE_MEM,
-	},
-	[2] = {
-		.start	= IRQ_PF9,
-		.end	= IRQ_PF9,
-		.flags	= (IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHEDGE),
-	},
-};
-
-static struct platform_device dm9000_device = {
-	.name		= "dm9000",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(dm9000_resources),
-	.resource	= dm9000_resources,
-};
-#endif
-
-#if defined(CONFIG_USB_SL811_HCD) || defined(CONFIG_USB_SL811_HCD_MODULE)
-static struct resource sl811_hcd_resources[] = {
-	{
-		.start = 0x20340000,
-		.end = 0x20340000,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = 0x20340004,
-		.end = 0x20340004,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = CONFIG_USB_SL811_BFIN_IRQ,
-		.end = CONFIG_USB_SL811_BFIN_IRQ,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	},
-};
-
-#if defined(CONFIG_USB_SL811_BFIN_USE_VBUS)
-void sl811_port_power(struct device *dev, int is_on)
-{
-	gpio_request(CONFIG_USB_SL811_BFIN_GPIO_VBUS, "usb:SL811_VBUS");
-	gpio_direction_output(CONFIG_USB_SL811_BFIN_GPIO_VBUS, is_on);
-
-}
-#endif
-
-static struct sl811_platform_data sl811_priv = {
-	.potpg = 10,
-	.power = 250,       /* == 500mA */
-#if defined(CONFIG_USB_SL811_BFIN_USE_VBUS)
-	.port_power = &sl811_port_power,
-#endif
-};
-
-static struct platform_device sl811_hcd_device = {
-	.name = "sl811-hcd",
-	.id = 0,
-	.dev = {
-		.platform_data = &sl811_priv,
-	},
-	.num_resources = ARRAY_SIZE(sl811_hcd_resources),
-	.resource = sl811_hcd_resources,
-};
-#endif
-
-#if defined(CONFIG_USB_ISP1362_HCD) || defined(CONFIG_USB_ISP1362_HCD_MODULE)
-static struct resource isp1362_hcd_resources[] = {
-	{
-		.start = 0x20360000,
-		.end = 0x20360000,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = 0x20360004,
-		.end = 0x20360004,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = CONFIG_USB_ISP1362_BFIN_GPIO_IRQ,
-		.end = CONFIG_USB_ISP1362_BFIN_GPIO_IRQ,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	},
-};
-
-static struct isp1362_platform_data isp1362_priv = {
-	.sel15Kres = 1,
-	.clknotstop = 0,
-	.oc_enable = 0,
-	.int_act_high = 0,
-	.int_edge_triggered = 0,
-	.remote_wakeup_connected = 0,
-	.no_power_switching = 1,
-	.power_switching_mode = 0,
-};
-
-static struct platform_device isp1362_hcd_device = {
-	.name = "isp1362-hcd",
-	.id = 0,
-	.dev = {
-		.platform_data = &isp1362_priv,
-	},
-	.num_resources = ARRAY_SIZE(isp1362_hcd_resources),
-	.resource = isp1362_hcd_resources,
-};
-#endif
-
-#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE)
-static struct platform_device bfin_mii_bus = {
-	.name = "bfin_mii_bus",
-};
-
-static struct platform_device bfin_mac_device = {
-	.name = "bfin_mac",
-	.dev.platform_data = &bfin_mii_bus,
-};
-#endif
-
-#if defined(CONFIG_USB_NET2272) || defined(CONFIG_USB_NET2272_MODULE)
-static struct resource net2272_bfin_resources[] = {
-	{
-		.start = 0x20300000,
-		.end = 0x20300000 + 0x100,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = IRQ_PF7,
-		.end = IRQ_PF7,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	},
-};
-
-static struct platform_device net2272_bfin_device = {
-	.name = "net2272",
-	.id = -1,
-	.num_resources = ARRAY_SIZE(net2272_bfin_resources),
-	.resource = net2272_bfin_resources,
-};
-#endif
-
-#if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE)
-/* all SPI peripherals info goes here */
-
-#if defined(CONFIG_MTD_M25P80) \
-	|| defined(CONFIG_MTD_M25P80_MODULE)
-static struct mtd_partition bfin_spi_flash_partitions[] = {
-	{
-		.name = "bootloader(spi)",
-		.size = 0x00020000,
-		.offset = 0,
-		.mask_flags = MTD_CAP_ROM
-	}, {
-		.name = "linux kernel(spi)",
-		.size = 0xe0000,
-		.offset = 0x20000
-	}, {
-		.name = "file system(spi)",
-		.size = 0x700000,
-		.offset = 0x00100000,
-	}
-};
-
-static struct flash_platform_data bfin_spi_flash_data = {
-	.name = "m25p80",
-	.parts = bfin_spi_flash_partitions,
-	.nr_parts = ARRAY_SIZE(bfin_spi_flash_partitions),
-	.type = "m25p64",
-};
-
-/* SPI flash chip (m25p64) */
-static struct bfin5xx_spi_chip spi_flash_chip_info = {
-	.enable_dma = 0,         /* use dma transfer with this chip*/
-	.bits_per_word = 8,
-};
-#endif
-
-#if defined(CONFIG_SPI_ADC_BF533) \
-	|| defined(CONFIG_SPI_ADC_BF533_MODULE)
-/* SPI ADC chip */
-static struct bfin5xx_spi_chip spi_adc_chip_info = {
-	.enable_dma = 1,         /* use dma transfer with this chip*/
-	.bits_per_word = 16,
-};
-#endif
-
-#if defined(CONFIG_SND_BLACKFIN_AD1836) \
-	|| defined(CONFIG_SND_BLACKFIN_AD1836_MODULE)
-static struct bfin5xx_spi_chip ad1836_spi_chip_info = {
-	.enable_dma = 0,
-	.bits_per_word = 16,
-};
-#endif
-
-#if defined(CONFIG_AD9960) || defined(CONFIG_AD9960_MODULE)
-static struct bfin5xx_spi_chip ad9960_spi_chip_info = {
-	.enable_dma = 0,
-	.bits_per_word = 16,
-};
-#endif
-
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
-	.bits_per_word = 8,
-};
-#endif
-
-#if defined(CONFIG_PBX)
-static struct bfin5xx_spi_chip spi_si3xxx_chip_info = {
-	.ctl_reg	= 0x4, /* send zero */
-	.enable_dma	= 0,
-	.bits_per_word	= 8,
-	.cs_change_per_word = 1,
-};
-#endif
-
-#if defined(CONFIG_TOUCHSCREEN_AD7877) || defined(CONFIG_TOUCHSCREEN_AD7877_MODULE)
-static struct bfin5xx_spi_chip spi_ad7877_chip_info = {
-	.enable_dma = 0,
-	.bits_per_word = 16,
-};
-
-static const struct ad7877_platform_data bfin_ad7877_ts_info = {
-	.model			= 7877,
-	.vref_delay_usecs	= 50,	/* internal, no capacitor */
-	.x_plate_ohms		= 419,
-	.y_plate_ohms		= 486,
-	.pressure_max		= 1000,
-	.pressure_min		= 0,
-	.stopacq_polarity 	= 1,
-	.first_conversion_delay = 3,
-	.acquisition_time 	= 1,
-	.averaging 		= 1,
-	.pen_down_acc_interval 	= 1,
-};
-#endif
-
-static struct spi_board_info bfin_spi_board_info[] __initdata = {
-#if defined(CONFIG_MTD_M25P80) \
-	|| defined(CONFIG_MTD_M25P80_MODULE)
-	{
-		/* the modalias must be the same as spi device driver name */
-		.modalias = "m25p80", /* Name of spi_driver for this device */
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0, /* Framework bus number */
-		.chip_select = 1, /* Framework chip select. On STAMP537 it is SPISSEL1*/
-		.platform_data = &bfin_spi_flash_data,
-		.controller_data = &spi_flash_chip_info,
-		.mode = SPI_MODE_3,
-	},
-#endif
-
-#if defined(CONFIG_SPI_ADC_BF533) \
-	|| defined(CONFIG_SPI_ADC_BF533_MODULE)
-	{
-		.modalias = "bfin_spi_adc", /* Name of spi_driver for this device */
-		.max_speed_hz = 6250000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0, /* Framework bus number */
-		.chip_select = 1, /* Framework chip select. */
-		.platform_data = NULL, /* No spi_driver specific config */
-		.controller_data = &spi_adc_chip_info,
-	},
-#endif
-
-#if defined(CONFIG_SND_BLACKFIN_AD1836) \
-	|| defined(CONFIG_SND_BLACKFIN_AD1836_MODULE)
-	{
-		.modalias = "ad1836-spi",
-		.max_speed_hz = 3125000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = CONFIG_SND_BLACKFIN_SPI_PFBIT,
-		.controller_data = &ad1836_spi_chip_info,
-	},
-#endif
-#if defined(CONFIG_AD9960) || defined(CONFIG_AD9960_MODULE)
-	{
-		.modalias = "ad9960-spi",
-		.max_speed_hz = 10000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 1,
-		.controller_data = &ad9960_spi_chip_info,
-	},
-#endif
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 0,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
-	{
-		.modalias = "spi_mmc",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
-#endif
-#if defined(CONFIG_PBX)
-	{
-		.modalias = "fxs-spi",
-		.max_speed_hz = 12500000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 8 - CONFIG_J11_JUMPER,
-		.controller_data = &spi_si3xxx_chip_info,
-		.mode = SPI_MODE_3,
-	},
-	{
-		.modalias = "fxo-spi",
-		.max_speed_hz = 12500000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 8 - CONFIG_J19_JUMPER,
-		.controller_data = &spi_si3xxx_chip_info,
-		.mode = SPI_MODE_3,
-	},
-#endif
-#if defined(CONFIG_TOUCHSCREEN_AD7877) || defined(CONFIG_TOUCHSCREEN_AD7877_MODULE)
-	{
-		.modalias		= "ad7877",
-		.platform_data		= &bfin_ad7877_ts_info,
-		.irq			= IRQ_PF6,
-		.max_speed_hz	= 12500000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num	= 0,
-		.chip_select  = 1,
-		.controller_data = &spi_ad7877_chip_info,
-	},
-#endif
-};
-
-/* SPI controller data */
-static struct bfin5xx_spi_master bfin_spi0_info = {
-	.num_chipselect = 8,
-	.enable_dma = 1,  /* master has the ability to do dma transfer */
-	.pin_req = {P_SPI0_SCK, P_SPI0_MISO, P_SPI0_MOSI, 0},
-};
-
-/* SPI (0) */
-static struct resource bfin_spi0_resource[] = {
-	[0] = {
-		.start = SPI0_REGBASE,
-		.end   = SPI0_REGBASE + 0xFF,
-		.flags = IORESOURCE_MEM,
-		},
-	[1] = {
-		.start = CH_SPI,
-		.end   = CH_SPI,
-		.flags = IORESOURCE_IRQ,
-	},
-};
-
-static struct platform_device bfin_spi0_device = {
-	.name = "bfin-spi",
-	.id = 0, /* Bus number */
-	.num_resources = ARRAY_SIZE(bfin_spi0_resource),
-	.resource = bfin_spi0_resource,
-	.dev = {
-		.platform_data = &bfin_spi0_info, /* Passed to driver */
-	},
-};
-#endif  /* spi master and devices */
-
-#if defined(CONFIG_FB_BF537_LQ035) || defined(CONFIG_FB_BF537_LQ035_MODULE)
-static struct platform_device bfin_fb_device = {
-	.name = "bf537-lq035",
-};
-#endif
-
-#if defined(CONFIG_FB_BFIN_7393) || defined(CONFIG_FB_BFIN_7393_MODULE)
-static struct platform_device bfin_fb_adv7393_device = {
-	.name = "bfin-adv7393",
-};
-#endif
-
-#if defined(CONFIG_SERIAL_BFIN) || defined(CONFIG_SERIAL_BFIN_MODULE)
-static struct resource bfin_uart_resources[] = {
-	{
-		.start = 0xFFC00400,
-		.end = 0xFFC004FF,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = 0xFFC02000,
-		.end = 0xFFC020FF,
-		.flags = IORESOURCE_MEM,
-	},
-};
-
-static struct platform_device bfin_uart_device = {
-	.name = "bfin-uart",
-	.id = 1,
-	.num_resources = ARRAY_SIZE(bfin_uart_resources),
-	.resource = bfin_uart_resources,
-};
-#endif
-
-#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE)
-#ifdef CONFIG_BFIN_SIR0
-static struct resource bfin_sir0_resources[] = {
-	{
-		.start = 0xFFC00400,
-		.end = 0xFFC004FF,
-		.flags = IORESOURCE_MEM,
-	},
-	{
-		.start = IRQ_UART0_RX,
-		.end = IRQ_UART0_RX+1,
-		.flags = IORESOURCE_IRQ,
-	},
-	{
-		.start = CH_UART0_RX,
-		.end = CH_UART0_RX+1,
-		.flags = IORESOURCE_DMA,
-	},
-};
-
-static struct platform_device bfin_sir0_device = {
-	.name = "bfin_sir",
-	.id = 0,
-	.num_resources = ARRAY_SIZE(bfin_sir0_resources),
-	.resource = bfin_sir0_resources,
-};
-#endif
-#ifdef CONFIG_BFIN_SIR1
-static struct resource bfin_sir1_resources[] = {
-	{
-		.start = 0xFFC02000,
-		.end = 0xFFC020FF,
-		.flags = IORESOURCE_MEM,
-	},
-	{
-		.start = IRQ_UART1_RX,
-		.end = IRQ_UART1_RX+1,
-		.flags = IORESOURCE_IRQ,
-	},
-	{
-		.start = CH_UART1_RX,
-		.end = CH_UART1_RX+1,
-		.flags = IORESOURCE_DMA,
-	},
-};
-
-static struct platform_device bfin_sir1_device = {
-	.name = "bfin_sir",
-	.id = 1,
-	.num_resources = ARRAY_SIZE(bfin_sir1_resources),
-	.resource = bfin_sir1_resources,
-};
-#endif
-#endif
-
-#if defined(CONFIG_I2C_BLACKFIN_TWI) || defined(CONFIG_I2C_BLACKFIN_TWI_MODULE)
-static struct resource bfin_twi0_resource[] = {
-	[0] = {
-		.start = TWI0_REGBASE,
-		.end   = TWI0_REGBASE + 0xFF,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_TWI,
-		.end   = IRQ_TWI,
-		.flags = IORESOURCE_IRQ,
-	},
-};
-
-static struct platform_device i2c_bfin_twi_device = {
-	.name = "i2c-bfin-twi",
-	.id = 0,
-	.num_resources = ARRAY_SIZE(bfin_twi0_resource),
-	.resource = bfin_twi0_resource,
-};
-#endif
-
-#if defined(CONFIG_SERIAL_BFIN_SPORT) || defined(CONFIG_SERIAL_BFIN_SPORT_MODULE)
-static struct platform_device bfin_sport0_uart_device = {
-	.name = "bfin-sport-uart",
-	.id = 0,
-};
-
-static struct platform_device bfin_sport1_uart_device = {
-	.name = "bfin-sport-uart",
-	.id = 1,
-};
-#endif
-
-static struct platform_device *stamp_devices[] __initdata = {
-#if defined(CONFIG_BFIN_CFPCMCIA) || defined(CONFIG_BFIN_CFPCMCIA_MODULE)
-	&bfin_pcmcia_cf_device,
-#endif
-
-#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE)
-	&rtc_device,
-#endif
-
-#if defined(CONFIG_USB_SL811_HCD) || defined(CONFIG_USB_SL811_HCD_MODULE)
-	&sl811_hcd_device,
-#endif
-
-#if defined(CONFIG_USB_ISP1362_HCD) || defined(CONFIG_USB_ISP1362_HCD_MODULE)
-	&isp1362_hcd_device,
-#endif
-
-#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE)
-	&smc91x_device,
-#endif
-
-#if defined(CONFIG_DM9000) || defined(CONFIG_DM9000_MODULE)
-	&dm9000_device,
-#endif
-
-#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE)
-	&bfin_mii_bus,
-	&bfin_mac_device,
-#endif
-
-#if defined(CONFIG_USB_NET2272) || defined(CONFIG_USB_NET2272_MODULE)
-	&net2272_bfin_device,
-#endif
-
-#if defined(CONFIG_USB_ISP1760_HCD) || defined(CONFIG_USB_ISP1760_HCD_MODULE)
-	&bfin_isp1760_device,
-#endif
-
-#if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE)
-	&bfin_spi0_device,
-#endif
-
-#if defined(CONFIG_FB_BF537_LQ035) || defined(CONFIG_FB_BF537_LQ035_MODULE)
-	&bfin_fb_device,
-#endif
-
-#if defined(CONFIG_FB_BFIN_7393) || defined(CONFIG_FB_BFIN_7393_MODULE)
-	&bfin_fb_adv7393_device,
-#endif
-
-#if defined(CONFIG_SERIAL_BFIN) || defined(CONFIG_SERIAL_BFIN_MODULE)
-	&bfin_uart_device,
-#endif
-
-#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE)
-#ifdef CONFIG_BFIN_SIR0
-	&bfin_sir0_device,
-#endif
-#ifdef CONFIG_BFIN_SIR1
-	&bfin_sir1_device,
-#endif
-#endif
-
-#if defined(CONFIG_I2C_BLACKFIN_TWI) || defined(CONFIG_I2C_BLACKFIN_TWI_MODULE)
-	&i2c_bfin_twi_device,
-#endif
-
-#if defined(CONFIG_SERIAL_BFIN_SPORT) || defined(CONFIG_SERIAL_BFIN_SPORT_MODULE)
-	&bfin_sport0_uart_device,
-	&bfin_sport1_uart_device,
-#endif
-};
-
-static int __init generic_init(void)
-{
-	printk(KERN_INFO "%s(): registering device resources\n", __func__);
-	platform_add_devices(stamp_devices, ARRAY_SIZE(stamp_devices));
-#if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE)
-	spi_register_board_info(bfin_spi_board_info,
-				ARRAY_SIZE(bfin_spi_board_info));
-#endif
-
-	return 0;
-}
-
-arch_initcall(generic_init);
-
-void native_machine_restart(char *cmd)
-{
-	/* workaround reboot hang when booting from SPI */
-	if ((bfin_read_SYSCR() & 0x7) == 0x3)
-		bfin_reset_boot_spi_cs(P_DEFAULT_BOOT_SPI_CS);
-}
-
-#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE)
-void bfin_get_ether_addr(char *addr)
-{
-	random_ether_addr(addr);
-	printk(KERN_WARNING "%s:%s: Setting Ethernet MAC to a random one\n", __FILE__, __func__);
-}
-EXPORT_SYMBOL(bfin_get_ether_addr);
-#endif
diff --git a/arch/blackfin/mach-bf561/boards/Kconfig b/arch/blackfin/mach-bf561/boards/Kconfig
index e41a67b..e4bc6d7 100644
--- a/arch/blackfin/mach-bf561/boards/Kconfig
+++ b/arch/blackfin/mach-bf561/boards/Kconfig
@@ -19,9 +19,4 @@ config BFIN561_BLUETECHNIX_CM
 	help
 	  CM-BF561 support for EVAL- and DEV-Board.
 
-config GENERIC_BF561_BOARD
-	bool "Generic"
-	help
-	  Generic or Custom board support.
-
 endchoice
diff --git a/arch/blackfin/mach-bf561/boards/Makefile b/arch/blackfin/mach-bf561/boards/Makefile
index 04add01..3a15255 100644
--- a/arch/blackfin/mach-bf561/boards/Makefile
+++ b/arch/blackfin/mach-bf561/boards/Makefile
@@ -2,7 +2,6 @@
 # arch/blackfin/mach-bf561/boards/Makefile
 #
 
-obj-$(CONFIG_GENERIC_BF561_BOARD)      += generic_board.o
 obj-$(CONFIG_BFIN561_BLUETECHNIX_CM)   += cm_bf561.o
 obj-$(CONFIG_BFIN561_EZKIT)            += ezkit.o
 obj-$(CONFIG_BFIN561_TEPLA)            += tepla.o
diff --git a/arch/blackfin/mach-bf561/boards/generic_board.c b/arch/blackfin/mach-bf561/boards/generic_board.c
deleted file mode 100644
index 0ba366a..0000000
--- a/arch/blackfin/mach-bf561/boards/generic_board.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * File:         arch/blackfin/mach-bf561/generic_board.c
- * Based on:     arch/blackfin/mach-bf533/ezkit.c
- * Author:       Aidan Williams <aidan@nicta.com.au>
- *
- * Created:
- * Description:
- *
- * Modified:
- *               Copyright 2005 National ICT Australia (NICTA)
- *               Copyright 2004-2006 Analog Devices Inc.
- *
- * Bugs:         Enter bugs at http://blackfin.uclinux.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see the file COPYING, or write
- * to the Free Software Foundation, Inc.,
- * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/irq.h>
-
-const char bfin_board_name[] = "UNKNOWN BOARD";
-
-/*
- *  Driver needs to know address, irq and flag pin.
- */
-#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE)
-static struct resource smc91x_resources[] = {
-	{
-		.start = 0x2C010300,
-		.end = 0x2C010300 + 16,
-		.flags = IORESOURCE_MEM,
-	}, {
-		.start = IRQ_PROG_INTB,
-		.end = IRQ_PROG_INTB,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	}, {
-		.start = IRQ_PF9,
-		.end = IRQ_PF9,
-		.flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
-	},
-};
-
-static struct platform_device smc91x_device = {
-	.name = "smc91x",
-	.id = 0,
-	.num_resources = ARRAY_SIZE(smc91x_resources),
-	.resource = smc91x_resources,
-};
-#endif
-
-#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE)
-#ifdef CONFIG_BFIN_SIR0
-static struct resource bfin_sir0_resources[] = {
-	{
-		.start = 0xFFC00400,
-		.end = 0xFFC004FF,
-		.flags = IORESOURCE_MEM,
-	},
-	{
-		.start = IRQ_UART0_RX,
-		.end = IRQ_UART0_RX+1,
-		.flags = IORESOURCE_IRQ,
-	},
-	{
-		.start = CH_UART0_RX,
-		.end = CH_UART0_RX+1,
-		.flags = IORESOURCE_DMA,
-	},
-};
-
-static struct platform_device bfin_sir0_device = {
-	.name = "bfin_sir",
-	.id = 0,
-	.num_resources = ARRAY_SIZE(bfin_sir0_resources),
-	.resource = bfin_sir0_resources,
-};
-#endif
-#endif
-
-static struct platform_device *generic_board_devices[] __initdata = {
-#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE)
-	&smc91x_device,
-#endif
-
-#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE)
-#ifdef CONFIG_BFIN_SIR0
-	&bfin_sir0_device,
-#endif
-#endif
-};
-
-static int __init generic_board_init(void)
-{
-	printk(KERN_INFO "%s(): registering device resources\n", __func__);
-	return platform_add_devices(generic_board_devices,
-				    ARRAY_SIZE(generic_board_devices));
-}
-
-arch_initcall(generic_board_init);
-- 
cgit v0.10.2


From 28e4cf22a3707d05eb697b9b8ae6a4ed39c99b45 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Mon, 2 Mar 2009 18:04:24 +0800
Subject: Blackfin arch: Disable NAND option by default

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig
index 4fb4108..c61b600 100644
--- a/arch/blackfin/configs/BF537-STAMP_defconfig
+++ b/arch/blackfin/configs/BF537-STAMP_defconfig
@@ -568,15 +568,7 @@ CONFIG_MTD_PHYSMAP_BANKWIDTH=2
 # CONFIG_MTD_DOC2000 is not set
 # CONFIG_MTD_DOC2001 is not set
 # CONFIG_MTD_DOC2001PLUS is not set
-CONFIG_MTD_NAND=m
-# CONFIG_MTD_NAND_VERIFY_WRITE is not set
-# CONFIG_MTD_NAND_ECC_SMC is not set
-# CONFIG_MTD_NAND_MUSEUM_IDS is not set
-# CONFIG_MTD_NAND_BFIN is not set
-CONFIG_MTD_NAND_IDS=m
-# CONFIG_MTD_NAND_DISKONCHIP is not set
-# CONFIG_MTD_NAND_NANDSIM is not set
-CONFIG_MTD_NAND_PLATFORM=m
+# CONFIG_MTD_NAND is not set
 # CONFIG_MTD_ONENAND is not set
 
 #
-- 
cgit v0.10.2


From 0f29456a21ae55c43b4e2a64611f778b1fbe4bdf Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Mon, 2 Mar 2009 18:06:13 +0800
Subject: Blackfin arch: Make IRQ_EPPIx_ERROR naming consistent

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf548/include/mach/irq.h b/arch/blackfin/mach-bf548/include/mach/irq.h
index 60299a7..f194625 100644
--- a/arch/blackfin/mach-bf548/include/mach/irq.h
+++ b/arch/blackfin/mach-bf548/include/mach/irq.h
@@ -123,8 +123,8 @@ Events         (highest priority)  EMU         0
 #define IRQ_MXVR_ERROR		BFIN_IRQ(51)	/* MXVR Status (Error) Interrupt */
 #define IRQ_MXVR_MSG		BFIN_IRQ(52)	/* MXVR Message Interrupt */
 #define IRQ_MXVR_PKT		BFIN_IRQ(53)	/* MXVR Packet Interrupt */
-#define IRQ_EPP1_ERROR		BFIN_IRQ(54)	/* EPPI1 Error Interrupt */
-#define IRQ_EPP2_ERROR		BFIN_IRQ(55)	/* EPPI2 Error Interrupt */
+#define IRQ_EPPI1_ERROR		BFIN_IRQ(54)	/* EPPI1 Error Interrupt */
+#define IRQ_EPPI2_ERROR		BFIN_IRQ(55)	/* EPPI2 Error Interrupt */
 #define IRQ_UART3_ERROR		BFIN_IRQ(56)	/* UART3 Status (Error) Interrupt */
 #define IRQ_HOST_ERROR		BFIN_IRQ(57)	/* HOST Status (Error) Interrupt */
 #define IRQ_PIXC_ERROR		BFIN_IRQ(59)	/* PIXC Status (Error) Interrupt */
@@ -361,8 +361,8 @@ Events         (highest priority)  EMU         0
 #define IRQ_UART2_ERR 		IRQ_UART2_ERROR
 #define IRQ_CAN0_ERR  		IRQ_CAN0_ERROR
 #define IRQ_MXVR_ERR  		IRQ_MXVR_ERROR
-#define IRQ_EPP1_ERR  		IRQ_EPP1_ERROR
-#define IRQ_EPP2_ERR  		IRQ_EPP2_ERROR
+#define IRQ_EPPI1_ERR  		IRQ_EPPI1_ERROR
+#define IRQ_EPPI2_ERR  		IRQ_EPPI2_ERROR
 #define IRQ_UART3_ERR 		IRQ_UART3_ERROR
 #define IRQ_HOST_ERR  		IRQ_HOST_ERROR
 #define IRQ_PIXC_ERR  		IRQ_PIXC_ERROR
-- 
cgit v0.10.2


From 34d464f8aa3e762ec812a131bfd53ccb4f886f69 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Mon, 2 Mar 2009 18:14:47 +0800
Subject: Blackfin arch: use common KGDB_TESTS rather than our own
 KGDB_TESTCASE

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/Kconfig.debug b/arch/blackfin/Kconfig.debug
index 5f981d9..79e7e63 100644
--- a/arch/blackfin/Kconfig.debug
+++ b/arch/blackfin/Kconfig.debug
@@ -21,12 +21,6 @@ config DEBUG_STACK_USAGE
 config HAVE_ARCH_KGDB
 	def_bool y
 
-config KGDB_TESTCASE
-	tristate "KGDB: for test case in expect"
-	default n
-	help
-	  This is a kgdb test case for automated testing.
-
 config DEBUG_VERBOSE
 	bool "Verbose fault messages"
 	default y
diff --git a/arch/blackfin/kernel/Makefile b/arch/blackfin/kernel/Makefile
index 4a92a86..fd4d432 100644
--- a/arch/blackfin/kernel/Makefile
+++ b/arch/blackfin/kernel/Makefile
@@ -15,13 +15,15 @@ else
     obj-y += time.o
 endif
 
-CFLAGS_kgdb_test.o := -mlong-calls -O0
-
 obj-$(CONFIG_IPIPE)                  += ipipe.o
 obj-$(CONFIG_IPIPE_TRACE_MCOUNT)     += mcount.o
 obj-$(CONFIG_BFIN_GPTIMERS)          += gptimers.o
 obj-$(CONFIG_CPLB_INFO)              += cplbinfo.o
 obj-$(CONFIG_MODULES)                += module.o
 obj-$(CONFIG_KGDB)                   += kgdb.o
-obj-$(CONFIG_KGDB_TESTCASE)          += kgdb_test.o
+obj-$(CONFIG_KGDB_TESTS)             += kgdb_test.o
 obj-$(CONFIG_EARLY_PRINTK)           += early_printk.o
+
+# the kgdb test puts code into L2 and without linker
+# relaxation, we need to force long calls to/from it
+CFLAGS_kgdb_test.o := -mlong-calls -O0
-- 
cgit v0.10.2


From e84dcaa18b2785d8ab20a7cb25612d89feb89fa7 Mon Sep 17 00:00:00 2001
From: Bernd Schmidt <bernds_cb1@t-online.de>
Date: Mon, 2 Mar 2009 18:37:48 +0800
Subject: Blackfin arch: fix bug - jump_to_zero test case failed on noMPU
 kernel

The nompu code is now derived from the mpu code, and had the same problem -
no null pointer detection on ICPLBs.

Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de>
Cc: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/kernel/cplb-nompu/cplbinit.c b/arch/blackfin/kernel/cplb-nompu/cplbinit.c
index 0e28f75..d6c0677 100644
--- a/arch/blackfin/kernel/cplb-nompu/cplbinit.c
+++ b/arch/blackfin/kernel/cplb-nompu/cplbinit.c
@@ -53,9 +53,13 @@ void __init generate_cplb_tables_cpu(unsigned int cpu)
 
 	i_d = i_i = 0;
 
+#ifdef CONFIG_DEBUG_HUNT_FOR_ZERO
 	/* Set up the zero page.  */
 	d_tbl[i_d].addr = 0;
 	d_tbl[i_d++].data = SDRAM_OOPS | PAGE_SIZE_1KB;
+	i_tbl[i_i].addr = 0;
+	i_tbl[i_i++].data = SDRAM_OOPS | PAGE_SIZE_1KB;
+#endif
 
 	/* Cover kernel memory with 4M pages.  */
 	addr = 0;
-- 
cgit v0.10.2


From 9976b39b5031bbf76f715893cf080b6a17683881 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 27 Feb 2009 09:19:26 -0800
Subject: xen: deal with virtually mapped percpu data

The virtually mapped percpu space causes us two problems:

 - for hypercalls which take an mfn, we need to do a full pagetable
   walk to convert the percpu va into an mfn, and

 - when a hypercall requires a page to be mapped RO via all its aliases,
   we need to make sure its RO in both the percpu mapping and in the
   linear mapping

This primarily affects the gdt and the vcpu info structure.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Xen-devel <xen-devel@lists.xensource.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <htejun@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990e..1a918dd 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
 
 
 xmaddr_t arbitrary_virt_to_machine(void *address);
+unsigned long arbitrary_virt_to_mfn(void *vaddr);
 void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 86497d5..352ea68 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
 
 	vcpup = &per_cpu(xen_vcpu_info, cpu);
 
-	info.mfn = virt_to_mfn(vcpup);
+	info.mfn = arbitrary_virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
 	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	frames = mcs.args;
 
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		frames[f] = virt_to_mfn(va);
+		frames[f] = arbitrary_virt_to_mfn((void *)va);
+
 		make_lowmem_page_readonly((void *)va);
+		make_lowmem_page_readonly(mfn_to_virt(frames[f]));
 	}
 
 	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
 	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
 	struct multicall_space mc = __xen_mc_entry(0);
 
 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 		break;
 
 	default: {
-		xmaddr_t maddr = virt_to_machine(&dt[entry]);
+		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
 
 		xen_mc_flush();
 		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40..cb6afa4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	p2m_top[topidx][idx] = mfn;
 }
 
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
+
+	return PFN_DOWN(maddr.maddr);
+}
+
 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
 	unsigned long address = (unsigned long)vaddr;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582a..8d47056 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
 	struct vcpu_guest_context *ctxt;
 	struct desc_struct *gdt;
+	unsigned long gdt_mfn;
 
 	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
 		return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->ldt_ents = 0;
 
 	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+
+	gdt_mfn = arbitrary_virt_to_mfn(gdt);
 	make_lowmem_page_readonly(gdt);
+	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
 
-	ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_ents      = GDT_ENTRIES;
 
 	ctxt->user_regs.cs = __KERNEL_CS;
-- 
cgit v0.10.2


From d1dd524785e30cf3d64d395d829b207376acb0aa Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 2 Mar 2009 06:46:50 +0000
Subject: sctp: fix crash during module unload

An extra list_del() during the module load failure and unload
resulted in a crash with a list corruption.  Now sctp can
be unloaded again.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index b78e3be..4e66384 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1322,9 +1322,8 @@ SCTP_STATIC __init int sctp_init(void)
 out:
 	return status;
 err_v6_add_protocol:
-	sctp_v6_del_protocol();
-err_add_protocol:
 	sctp_v4_del_protocol();
+err_add_protocol:
 	inet_ctl_sock_destroy(sctp_ctl_sock);
 err_ctl_sock_init:
 	sctp_v6_protosw_exit();
@@ -1335,7 +1334,6 @@ err_protosw_init:
 	sctp_v4_pf_exit();
 	sctp_v6_pf_exit();
 	sctp_sysctl_unregister();
-	list_del(&sctp_af_inet.list);
 	free_pages((unsigned long)sctp_port_hashtable,
 		   get_order(sctp_port_hashsize *
 			     sizeof(struct sctp_bind_hashbucket)));
@@ -1383,7 +1381,6 @@ SCTP_STATIC __exit void sctp_exit(void)
 	sctp_v4_pf_exit();
 
 	sctp_sysctl_unregister();
-	list_del(&sctp_af_inet.list);
 
 	free_pages((unsigned long)sctp_assoc_hashtable,
 		   get_order(sctp_assoc_hashsize *
-- 
cgit v0.10.2


From 3df2678737974accf437dad11e584c1871a3ede3 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 2 Mar 2009 06:46:51 +0000
Subject: sctp: fix kernel panic with ERROR chunk containing too many error
 causes

If ERROR chunk is received with too many error causes in ESTABLISHED
state, the kernel get panic.

This is because sctp limit the max length of cmds to 14, but while
ERROR chunk is received, one error cause will add around 2 cmds by
sctp_add_cmd_sf(). So many error causes will fill the limit of cmds
and panic.

This patch fixed the problem.

This bug can be test by SCTP Conformance Test Suite
<http://networktest.sourceforge.net/>.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index e1d6076..b5495ae 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -787,36 +787,48 @@ static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds,
 				   struct sctp_association *asoc,
 				   struct sctp_chunk *chunk)
 {
-	struct sctp_operr_chunk *operr_chunk;
 	struct sctp_errhdr *err_hdr;
+	struct sctp_ulpevent *ev;
 
-	operr_chunk = (struct sctp_operr_chunk *)chunk->chunk_hdr;
-	err_hdr = &operr_chunk->err_hdr;
+	while (chunk->chunk_end > chunk->skb->data) {
+		err_hdr = (struct sctp_errhdr *)(chunk->skb->data);
 
-	switch (err_hdr->cause) {
-	case SCTP_ERROR_UNKNOWN_CHUNK:
-	{
-		struct sctp_chunkhdr *unk_chunk_hdr;
+		ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
+						     GFP_ATOMIC);
+		if (!ev)
+			return;
 
-		unk_chunk_hdr = (struct sctp_chunkhdr *)err_hdr->variable;
-		switch (unk_chunk_hdr->type) {
-		/* ADDIP 4.1 A9) If the peer responds to an ASCONF with an
-		 * ERROR chunk reporting that it did not recognized the ASCONF
-		 * chunk type, the sender of the ASCONF MUST NOT send any
-		 * further ASCONF chunks and MUST stop its T-4 timer.
-		 */
-		case SCTP_CID_ASCONF:
-			asoc->peer.asconf_capable = 0;
-			sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
+		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+
+		switch (err_hdr->cause) {
+		case SCTP_ERROR_UNKNOWN_CHUNK:
+		{
+			sctp_chunkhdr_t *unk_chunk_hdr;
+
+			unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable;
+			switch (unk_chunk_hdr->type) {
+			/* ADDIP 4.1 A9) If the peer responds to an ASCONF with
+			 * an ERROR chunk reporting that it did not recognized
+			 * the ASCONF chunk type, the sender of the ASCONF MUST
+			 * NOT send any further ASCONF chunks and MUST stop its
+			 * T-4 timer.
+			 */
+			case SCTP_CID_ASCONF:
+				if (asoc->peer.asconf_capable == 0)
+					break;
+
+				asoc->peer.asconf_capable = 0;
+				sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
 					SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+				break;
+			default:
+				break;
+			}
 			break;
+		}
 		default:
 			break;
 		}
-		break;
-	}
-	default:
-		break;
 	}
 }
 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3a0cd07..f88dfde 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3163,7 +3163,6 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
 					sctp_cmd_seq_t *commands)
 {
 	struct sctp_chunk *chunk = arg;
-	struct sctp_ulpevent *ev;
 
 	if (!sctp_vtag_verify(chunk, asoc))
 		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
@@ -3173,21 +3172,10 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
 		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
 						  commands);
 
-	while (chunk->chunk_end > chunk->skb->data) {
-		ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
-						     GFP_ATOMIC);
-		if (!ev)
-			goto nomem;
+	sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
+			SCTP_CHUNK(chunk));
 
-		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
-				SCTP_ULPEVENT(ev));
-		sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
-				SCTP_CHUNK(chunk));
-	}
 	return SCTP_DISPOSITION_CONSUME;
-
-nomem:
-	return SCTP_DISPOSITION_NOMEM;
 }
 
 /*
-- 
cgit v0.10.2


From 07555c9880da3e2e96e5eae00a03b44cc076deaf Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Mon, 2 Mar 2009 22:29:37 -0800
Subject: OMAP: enable smc911x support for LDP platform

The following patch enables SMC911x support to work on the OMAP LDP
board.  Although the SMC911x driver will eventually be obsoleted, the
smsc911x patches are rather invasive for the -rc kernels.

Rather than risk destablising smsc911x, this simpler patch is preferred
to allow the network interface to work.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/arm/mach-omap2/board-ldp.c b/arch/arm/mach-omap2/board-ldp.c
index f6a1345..6031e17 100644
--- a/arch/arm/mach-omap2/board-ldp.c
+++ b/arch/arm/mach-omap2/board-ldp.c
@@ -81,7 +81,7 @@ static inline void __init ldp_init_smc911x(void)
 	}
 
 	ldp_smc911x_resources[0].start = cs_mem_base + 0x0;
-	ldp_smc911x_resources[0].end   = cs_mem_base + 0xf;
+	ldp_smc911x_resources[0].end   = cs_mem_base + 0xff;
 	udelay(100);
 
 	eth_gpio = LDP_SMC911X_GPIO;
diff --git a/drivers/net/smc911x.h b/drivers/net/smc911x.h
index 870b4c3..a45952e 100644
--- a/drivers/net/smc911x.h
+++ b/drivers/net/smc911x.h
@@ -42,6 +42,16 @@
   #define SMC_USE_16BIT		0
   #define SMC_USE_32BIT		1
   #define SMC_IRQ_SENSE		IRQF_TRIGGER_LOW
+#elif defined(CONFIG_ARCH_OMAP34XX)
+  #define SMC_USE_16BIT		0
+  #define SMC_USE_32BIT		1
+  #define SMC_IRQ_SENSE		IRQF_TRIGGER_LOW
+  #define SMC_MEM_RESERVED	1
+#elif defined(CONFIG_ARCH_OMAP24XX)
+  #define SMC_USE_16BIT		0
+  #define SMC_USE_32BIT		1
+  #define SMC_IRQ_SENSE		IRQF_TRIGGER_LOW
+  #define SMC_MEM_RESERVED	1
 #else
 /*
  * Default configuration
@@ -675,6 +685,7 @@ smc_pxa_dma_outsl(struct smc911x_local *lp, u_long physaddr,
 #define CHIP_9116	0x0116
 #define CHIP_9117	0x0117
 #define CHIP_9118	0x0118
+#define CHIP_9211	0x9211
 #define CHIP_9215	0x115A
 #define CHIP_9217	0x117A
 #define CHIP_9218	0x118A
@@ -689,6 +700,7 @@ static const struct chip_id chip_ids[] =  {
 	{ CHIP_9116, "LAN9116" },
 	{ CHIP_9117, "LAN9117" },
 	{ CHIP_9118, "LAN9118" },
+	{ CHIP_9211, "LAN9211" },
 	{ CHIP_9215, "LAN9215" },
 	{ CHIP_9217, "LAN9217" },
 	{ CHIP_9218, "LAN9218" },
-- 
cgit v0.10.2


From 5a5990d3090b03745a9548a6f5edef02095675cf Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 26 Feb 2009 06:49:24 +0000
Subject: net: Avoid race between network down and sysfs

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6ac29a4..484f587 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -77,7 +77,9 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
 	if (endp == buf)
 		goto err;
 
-	rtnl_lock();
+	if (!rtnl_trylock())
+		return -ERESTARTSYS;
+
 	if (dev_isalive(net)) {
 		if ((ret = (*set)(net, new)) == 0)
 			ret = len;
-- 
cgit v0.10.2


From b325fddb7f869e6c95a88dc6573220f162e5b89f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 26 Feb 2009 06:55:31 +0000
Subject: ipv6: Fix sysctl unregistration deadlock

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f9afb45..40fabde 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -493,15 +493,17 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
 	read_unlock(&dev_base_lock);
 }
 
-static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
+static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
 {
 	struct net *net;
 
 	net = (struct net *)table->extra2;
 	if (p == &net->ipv6.devconf_dflt->forwarding)
-		return;
+		return 0;
+
+	if (!rtnl_trylock())
+		return -ERESTARTSYS;
 
-	rtnl_lock();
 	if (p == &net->ipv6.devconf_all->forwarding) {
 		__s32 newf = net->ipv6.devconf_all->forwarding;
 		net->ipv6.devconf_dflt->forwarding = newf;
@@ -512,6 +514,7 @@ static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
 
 	if (*p)
 		rt6_purge_dflt_routers(net);
+	return 1;
 }
 #endif
 
@@ -3983,7 +3986,7 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
 	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
 
 	if (write)
-		addrconf_fixup_forwarding(ctl, valp, val);
+		ret = addrconf_fixup_forwarding(ctl, valp, val);
 	return ret;
 }
 
@@ -4019,8 +4022,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table,
 	}
 
 	*valp = new;
-	addrconf_fixup_forwarding(table, valp, val);
-	return 1;
+	return addrconf_fixup_forwarding(table, valp, val);
 }
 
 static struct addrconf_sysctl_table
-- 
cgit v0.10.2


From ee554be9ddcb666445b6765d8b5cdbfe80a1e9cf Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Tue, 3 Mar 2009 16:52:55 +0800
Subject: Blackfin arch: fix compile failure when missing the anomaly
 definition

make sure ANOMALY_05000278/ANOMALY_05000380 is defined for all parts

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf518/include/mach/anomaly.h b/arch/blackfin/mach-bf518/include/mach/anomaly.h
index e5b4bef..6e16700 100644
--- a/arch/blackfin/mach-bf518/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf518/include/mach/anomaly.h
@@ -65,6 +65,7 @@
 #define ANOMALY_05000263 (0)
 #define ANOMALY_05000266 (0)
 #define ANOMALY_05000273 (0)
+#define ANOMALY_05000278 (0)
 #define ANOMALY_05000285 (0)
 #define ANOMALY_05000307 (0)
 #define ANOMALY_05000311 (0)
@@ -72,6 +73,7 @@
 #define ANOMALY_05000323 (0)
 #define ANOMALY_05000353 (0)
 #define ANOMALY_05000363 (0)
+#define ANOMALY_05000380 (0)
 #define ANOMALY_05000386 (0)
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
diff --git a/arch/blackfin/mach-bf527/include/mach/anomaly.h b/arch/blackfin/mach-bf527/include/mach/anomaly.h
index 035e8d8..fdc8783 100644
--- a/arch/blackfin/mach-bf527/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf527/include/mach/anomaly.h
@@ -167,6 +167,7 @@
 #define ANOMALY_05000263 (0)
 #define ANOMALY_05000266 (0)
 #define ANOMALY_05000273 (0)
+#define ANOMALY_05000278 (0)
 #define ANOMALY_05000285 (0)
 #define ANOMALY_05000307 (0)
 #define ANOMALY_05000311 (0)
diff --git a/arch/blackfin/mach-bf533/include/mach/anomaly.h b/arch/blackfin/mach-bf533/include/mach/anomaly.h
index 0d3a034..657dd18 100644
--- a/arch/blackfin/mach-bf533/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf533/include/mach/anomaly.h
@@ -278,6 +278,7 @@
 #define ANOMALY_05000266 (0)
 #define ANOMALY_05000323 (0)
 #define ANOMALY_05000353 (1)
+#define ANOMALY_05000380 (0)
 #define ANOMALY_05000386 (1)
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
diff --git a/arch/blackfin/mach-bf537/include/mach/anomaly.h b/arch/blackfin/mach-bf537/include/mach/anomaly.h
index 9cb3912..fe1ae4c 100644
--- a/arch/blackfin/mach-bf537/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf537/include/mach/anomaly.h
@@ -168,6 +168,7 @@
 #define ANOMALY_05000323 (0)
 #define ANOMALY_05000353 (1)
 #define ANOMALY_05000363 (0)
+#define ANOMALY_05000380 (0)
 #define ANOMALY_05000386 (1)
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
diff --git a/arch/blackfin/mach-bf538/include/mach/anomaly.h b/arch/blackfin/mach-bf538/include/mach/anomaly.h
index e130b4f..56ea454 100644
--- a/arch/blackfin/mach-bf538/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf538/include/mach/anomaly.h
@@ -124,6 +124,7 @@
 #define ANOMALY_05000323 (0)
 #define ANOMALY_05000353 (1)
 #define ANOMALY_05000363 (0)
+#define ANOMALY_05000380 (0)
 #define ANOMALY_05000386 (1)
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
diff --git a/arch/blackfin/mach-bf548/include/mach/anomaly.h b/arch/blackfin/mach-bf548/include/mach/anomaly.h
index 23d03c5..d9d10a7 100644
--- a/arch/blackfin/mach-bf548/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf548/include/mach/anomaly.h
@@ -171,6 +171,7 @@
 #define ANOMALY_05000263 (0)
 #define ANOMALY_05000266 (0)
 #define ANOMALY_05000273 (0)
+#define ANOMALY_05000278 (0)
 #define ANOMALY_05000307 (0)
 #define ANOMALY_05000311 (0)
 #define ANOMALY_05000323 (0)
diff --git a/arch/blackfin/mach-bf561/include/mach/anomaly.h b/arch/blackfin/mach-bf561/include/mach/anomaly.h
index 1a9e175..24d3a22 100644
--- a/arch/blackfin/mach-bf561/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf561/include/mach/anomaly.h
@@ -283,6 +283,7 @@
 #define ANOMALY_05000273 (0)
 #define ANOMALY_05000311 (0)
 #define ANOMALY_05000353 (1)
+#define ANOMALY_05000380 (0)
 #define ANOMALY_05000386 (1)
 #define ANOMALY_05000432 (0)
 #define ANOMALY_05000435 (0)
-- 
cgit v0.10.2


From 176c39af29bc4edaf37f663553eeaacd47b5bc9c Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <dlezcano@fr.ibm.com>
Date: Tue, 3 Mar 2009 01:06:45 -0800
Subject: netns: fix addrconf_ifdown kernel panic

When a network namespace is destroyed the network interfaces are
all unregistered, making addrconf_ifdown called by the netdevice
notifier.
In the other hand, the addrconf exit method does a loop on the network
devices and does addrconf_ifdown on each of them. But the ordering of
the netns subsystem is not right because it uses the register_pernet_device
instead of register_pernet_subsys. If we handle the loopback as
any network device, we can safely use register_pernet_subsys.

But if we use register_pernet_subsys, the addrconf exit method will do
exactly what was already done with the unregistering of the network
devices. So in definitive, this code is pointless.

I removed the netns addrconf exit method and moved the code to the
addrconf cleanup function.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 40fabde..1220e2c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2611,9 +2611,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	ASSERT_RTNL();
 
-	if ((dev->flags & IFF_LOOPBACK) && how == 1)
-		how = 0;
-
 	rt6_ifdown(net, dev);
 	neigh_ifdown(&nd_tbl, dev);
 
@@ -4448,25 +4445,6 @@ int unregister_inet6addr_notifier(struct notifier_block *nb)
 
 EXPORT_SYMBOL(unregister_inet6addr_notifier);
 
-static void addrconf_net_exit(struct net *net)
-{
-	struct net_device *dev;
-
-	rtnl_lock();
-	/* clean dev list */
-	for_each_netdev(net, dev) {
-		if (__in6_dev_get(dev) == NULL)
-			continue;
-		addrconf_ifdown(dev, 1);
-	}
-	addrconf_ifdown(net->loopback_dev, 2);
-	rtnl_unlock();
-}
-
-static struct pernet_operations addrconf_net_ops = {
-	.exit = addrconf_net_exit,
-};
-
 /*
  *	Init / cleanup code
  */
@@ -4508,10 +4486,6 @@ int __init addrconf_init(void)
 	if (err)
 		goto errlo;
 
-	err = register_pernet_device(&addrconf_net_ops);
-	if (err)
-		return err;
-
 	register_netdevice_notifier(&ipv6_dev_notf);
 
 	addrconf_verify(0);
@@ -4541,15 +4515,22 @@ errlo:
 void addrconf_cleanup(void)
 {
 	struct inet6_ifaddr *ifa;
+	struct net_device *dev;
 	int i;
 
 	unregister_netdevice_notifier(&ipv6_dev_notf);
-	unregister_pernet_device(&addrconf_net_ops);
-
 	unregister_pernet_subsys(&addrconf_ops);
 
 	rtnl_lock();
 
+	/* clean dev list */
+	for_each_netdev(&init_net, dev) {
+		if (__in6_dev_get(dev) == NULL)
+			continue;
+		addrconf_ifdown(dev, 1);
+	}
+	addrconf_ifdown(init_net.loopback_dev, 2);
+
 	/*
 	 *	Check hash table.
 	 */
@@ -4570,6 +4551,4 @@ void addrconf_cleanup(void)
 
 	del_timer(&addr_chk_timer);
 	rtnl_unlock();
-
-	unregister_pernet_subsys(&addrconf_net_ops);
 }
-- 
cgit v0.10.2


From 6eb0777228f31932fc941eafe8b08848466630a1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Sun, 22 Feb 2009 00:09:14 -0800
Subject: netns: Fix icmp shutdown.

Recently I had a kernel panic in icmp_send during a network namespace
cleanup.  There were packets in the arp queue that failed to be sent
and we attempted to generate an ICMP host unreachable message, but
failed because icmp_sk_exit had already been called.

The network devices are removed from a network namespace and their
arp queues are flushed before we do attempt to shutdown subsystems
so this error should have been impossible.

It turns out icmp_init is using register_pernet_device instead
of register_pernet_subsys.  Which resulted in icmp being shut down
while we still had the possibility of packets in flight, making
a nasty NULL pointer deference in interrupt context possible.

Changing this to register_pernet_subsys fixes the problem in
my testing.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 705b33b..fc562d2 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1205,7 +1205,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = {
 
 int __init icmp_init(void)
 {
-	return register_pernet_device(&icmp_sk_ops);
+	return register_pernet_subsys(&icmp_sk_ops);
 }
 
 EXPORT_SYMBOL(icmp_err_convert);
-- 
cgit v0.10.2


From 2f20d2e667ab1ca44cde5fb361386dff5bb6081d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Sun, 22 Feb 2009 00:10:18 -0800
Subject: tcp: Like icmp use register_pernet_subsys

To remove the possibility of packets flying around when network
devices are being cleaned up use reisger_pernet_subsys instead of
register_pernet_device.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 19d7b42..cf74c41 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2443,7 +2443,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
 void __init tcp_v4_init(void)
 {
 	inet_hashinfo_init(&tcp_hashinfo);
-	if (register_pernet_device(&tcp_sk_ops))
+	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
 }
 
-- 
cgit v0.10.2


From 17edde520927070a6bf14a6a75027c0b843443e5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Sun, 22 Feb 2009 00:11:09 -0800
Subject: netns: Remove net_alive

It turns out that net_alive is unnecessary, and the original problem
that led to it being added was simply that the icmp code thought
it was a network device and wound up being unable to handle packets
while there were still packets in the network namespace.

Now that icmp and tcp have been fixed to properly register themselves
this problem is no longer present and we have a stronger guarantee
that packets will not arrive in a network namespace then that provided
by net_alive in netif_receive_skb.  So remove net_alive allowing
packet reception run a little faster.

Additionally document the strong reason why network namespace cleanup
is safe so that if something happens again someone else will have
a chance of figuring it out.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 6fc13d9..ded434b 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -109,11 +109,6 @@ extern struct list_head net_namespace_list;
 #ifdef CONFIG_NET_NS
 extern void __put_net(struct net *net);
 
-static inline int net_alive(struct net *net)
-{
-	return net && atomic_read(&net->count);
-}
-
 static inline struct net *get_net(struct net *net)
 {
 	atomic_inc(&net->count);
@@ -145,11 +140,6 @@ int net_eq(const struct net *net1, const struct net *net2)
 }
 #else
 
-static inline int net_alive(struct net *net)
-{
-	return 1;
-}
-
 static inline struct net *get_net(struct net *net)
 {
 	return net;
@@ -234,6 +224,23 @@ struct pernet_operations {
 	void (*exit)(struct net *net);
 };
 
+/*
+ * Use these carefully.  If you implement a network device and it
+ * needs per network namespace operations use device pernet operations,
+ * otherwise use pernet subsys operations.
+ *
+ * This is critically important.  Most of the network code cleanup
+ * runs with the assumption that dev_remove_pack has been called so no
+ * new packets will arrive during and after the cleanup functions have
+ * been called.  dev_remove_pack is not per namespace so instead the
+ * guarantee of no more packets arriving in a network namespace is
+ * provided by ensuring that all network devices and all sockets have
+ * left the network namespace before the cleanup methods are called.
+ *
+ * For the longest time the ipv4 icmp code was registered as a pernet
+ * device which caused kernel oops, and panics during network
+ * namespace cleanup.   So please don't get this wrong.
+ */
 extern int register_pernet_subsys(struct pernet_operations *);
 extern void unregister_pernet_subsys(struct pernet_operations *);
 extern int register_pernet_gen_subsys(int *id, struct pernet_operations *);
diff --git a/net/core/dev.c b/net/core/dev.c
index 72b0d26f..9e4afe6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2267,12 +2267,6 @@ int netif_receive_skb(struct sk_buff *skb)
 
 	rcu_read_lock();
 
-	/* Don't receive packets in an exiting network namespace */
-	if (!net_alive(dev_net(skb->dev))) {
-		kfree_skb(skb);
-		goto out;
-	}
-
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2adb1a7..e3bebd3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -157,9 +157,6 @@ static void cleanup_net(struct work_struct *work)
 	struct pernet_operations *ops;
 	struct net *net;
 
-	/* Be very certain incoming network packets will not find us */
-	rcu_barrier();
-
 	net = container_of(work, struct net, work);
 
 	mutex_lock(&net_mutex);
-- 
cgit v0.10.2


From 97d4b35fb44cd5a80bc10952e2b1de5d3a14117b Mon Sep 17 00:00:00 2001
From: Tom Parker <blackfin@tevp.net>
Date: Tue, 3 Mar 2009 17:59:39 +0800
Subject: Blackfin arch: fix bug - Error if one serial has hardware flow
 control and the other doesn't

I have a system where UART0 is configured with hardware flow control, but UART1
doesn't have it enabled. Attempting to access UART1 in this configuration
results in the following error in dmesg:

<3>bfin-gpio: GPIO 0 is already reserved as Peripheral by bfin-uart !
<5>Stack from 0082bc7c:
<5>        0082bc88 00404dd6 00000003 00000000 0054051e 004079da 0082bcb4
00000000
<5>        00000003 00000000 0052686c 0113f2a0 005fa3f0 00000032 20515249
00003035
<5>        00427228 00526e50 0113f2e0 005fa3f0 00000032 0113f2e0 0054b748
0000ffff
<5>        22222222 22222222 004e1628 00427304 00000000 00000032 00000023
0054b748
<5>        00487a94 0054b7e8 0054b748 0000000b 00487fb8 0054b748 0054b748
00000001
<5>        0000000a 005fa3f0 009d4fe8 0101e3c0 0054b748 005fa3f0 0050b134
0054b748
<5>
<5>Call Trace:
<4>[<00485c16>] _uart_startup+0x56/0x178
<4>[<004865c8>] _uart_open+0x40/0x3e0
<4>[<0048661c>] _uart_open+0x94/0x3e0
<4>[<0047f1ce>] _init_dev+0x1fa/0x450
<4>[<004e1628>] ___mutex_unlock_slowpath+0x30/0xe8
<4>[<004815da>] _tty_open+0xf6/0x21c
<4>[<0043dab0>] ___path_lookup_intent_open+0x34/0x7c
<4>[<004375e4>] _chrdev_open+0x7c/0x134
<4>[<0043dc2c>] _open_namei+0x60/0x568
<4>[<00433fa2>] ___dentry_open+0x9e/0x188
<4>[<00437568>] _chrdev_open+0x0/0x134
<4>[<0043410c>] _nameidata_to_filp+0x30/0x3c
<4>[<00434152>] _do_filp_open+0x3a/0x44
<4>[<00408826>] _task_running_tick+0x102/0x278
<4>[<0043418e>] _do_sys_open+0x32/0xac
<4>[<0043ede4>] _sys_ioctl+0x28/0x50
<4>[<0043edbc>] _sys_ioctl+0x0/0x50
<4>[<00434224>] _sys_open+0x18/0x20
<4>[<0043420c>] _sys_open+0x0/0x20
<4>[<00418174>] _sys_setuid+0x0/0xc8

This is because the #ifdef's in bfin_serial_5xx.h are messed up. More
specifically, they add/remove the uart_{rts,cts}_pin fields in
bfin_serial_resources based on whether the particular port has rts/cts enabled,
as opposed to when either port has it enabled.

This patch fixed this.

Signed-off-by: Tom Parker <blackfin@tevp.net>
Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h
index b50a63b..e21c1c3 100644
--- a/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h
+++ b/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h
@@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	 CH_UART0_TX,
 	 CH_UART0_RX,
 #endif
-#ifdef CONFIG_BFIN_UART0_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	 CONFIG_UART0_CTS_PIN,
 	 CONFIG_UART0_RTS_PIN,
 #endif
@@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	 CH_UART1_TX,
 	 CH_UART1_RX,
 #endif
-#ifdef CONFIG_BFIN_UART1_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	 CONFIG_UART1_CTS_PIN,
 	 CONFIG_UART1_RTS_PIN,
 #endif
diff --git a/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h
index 75722d6..e8c41fd 100644
--- a/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h
+++ b/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h
@@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	 CH_UART0_TX,
 	 CH_UART0_RX,
 #endif
-#ifdef CONFIG_BFIN_UART0_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	 CONFIG_UART0_CTS_PIN,
 	 CONFIG_UART0_RTS_PIN,
 #endif
@@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	 CH_UART1_TX,
 	 CH_UART1_RX,
 #endif
-#ifdef CONFIG_BFIN_UART1_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	 CONFIG_UART1_CTS_PIN,
 	 CONFIG_UART1_RTS_PIN,
 #endif
diff --git a/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h
index f3d9e49..5f517f5 100644
--- a/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h
+++ b/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h
@@ -134,7 +134,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART_TX,
 	CH_UART_RX,
 #endif
-#ifdef CONFIG_BFIN_UART0_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART0_CTS_PIN,
 	CONFIG_UART0_RTS_PIN,
 #endif
diff --git a/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h
index b3f87e1..9e34700 100644
--- a/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h
+++ b/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h
@@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART0_TX,
 	CH_UART0_RX,
 #endif
-#ifdef CONFIG_BFIN_UART0_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART0_CTS_PIN,
 	CONFIG_UART0_RTS_PIN,
 #endif
@@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART1_TX,
 	CH_UART1_RX,
 #endif
-#ifdef CONFIG_BFIN_UART1_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART1_CTS_PIN,
 	CONFIG_UART1_RTS_PIN,
 #endif
diff --git a/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h
index 40503b6..3c2811e 100644
--- a/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h
+++ b/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h
@@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART0_TX,
 	CH_UART0_RX,
 #endif
-#ifdef CONFIG_BFIN_UART0_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART0_CTS_PIN,
 	CONFIG_UART0_RTS_PIN,
 #endif
@@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART1_TX,
 	CH_UART1_RX,
 #endif
-#ifdef CONFIG_BFIN_UART1_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART1_CTS_PIN,
 	CONFIG_UART1_RTS_PIN,
 #endif
diff --git a/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h
index e4cf35e..c05e79c 100644
--- a/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h
+++ b/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h
@@ -63,7 +63,7 @@
 #define UART_ENABLE_INTS(x, v) UART_SET_IER(x, v)
 #define UART_DISABLE_INTS(x) UART_CLEAR_IER(x, 0xF)
 
-#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART1_CTSRTS)
+#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART2_CTSRTS)
 # define CONFIG_SERIAL_BFIN_CTSRTS
 
 # ifndef CONFIG_UART0_CTS_PIN
@@ -74,12 +74,12 @@
 #  define CONFIG_UART0_RTS_PIN -1
 # endif
 
-# ifndef CONFIG_UART1_CTS_PIN
-#  define CONFIG_UART1_CTS_PIN -1
+# ifndef CONFIG_UART2_CTS_PIN
+#  define CONFIG_UART2_CTS_PIN -1
 # endif
 
-# ifndef CONFIG_UART1_RTS_PIN
-#  define CONFIG_UART1_RTS_PIN -1
+# ifndef CONFIG_UART2_RTS_PIN
+#  define CONFIG_UART2_RTS_PIN -1
 # endif
 #endif
 
@@ -130,7 +130,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART0_TX,
 	CH_UART0_RX,
 #endif
-#ifdef CONFIG_BFIN_UART0_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART0_CTS_PIN,
 	CONFIG_UART0_RTS_PIN,
 #endif
@@ -144,6 +144,10 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART1_TX,
 	CH_UART1_RX,
 #endif
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
+	0,
+	0,
+#endif
 	},
 #endif
 #ifdef CONFIG_SERIAL_BFIN_UART2
@@ -154,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART2_TX,
 	CH_UART2_RX,
 #endif
-#ifdef CONFIG_BFIN_UART2_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART2_CTS_PIN,
 	CONFIG_UART2_RTS_PIN,
 #endif
@@ -168,6 +172,10 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART3_TX,
 	CH_UART3_RX,
 #endif
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
+	0,
+	0,
+#endif
 	},
 #endif
 };
diff --git a/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h
index 043bfcf..ca8c5f6 100644
--- a/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h
+++ b/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h
@@ -134,7 +134,7 @@ struct bfin_serial_res bfin_serial_resource[] = {
 	CH_UART_TX,
 	CH_UART_RX,
 #endif
-#ifdef CONFIG_BFIN_UART0_CTSRTS
+#ifdef CONFIG_SERIAL_BFIN_CTSRTS
 	CONFIG_UART0_CTS_PIN,
 	CONFIG_UART0_RTS_PIN,
 #endif
-- 
cgit v0.10.2


From 25ef4a67e78e1322d55f0a38783537ed89addc02 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@gmail.com>
Date: Mon, 2 Mar 2009 22:39:36 +0100
Subject: [ARM] 5416/1: Use unused address in v6_early_abort

The target of the strex instruction to clear the exlusive monitor
is currently the top of the stack.  If the store succeeeds this
corrupts r0 in pt_regs.  Use the next stack location instead of
the current one to prevent any chance of corrupting an in-use
address.

Signed-off-by: Seth Forshee <seth.forshee@gmail.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 8a7f65b..94077fb 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -23,7 +23,8 @@ ENTRY(v6_early_abort)
 #ifdef CONFIG_CPU_32v6K
 	clrex
 #else
-	strex	r0, r1, [sp]			@ Clear the exclusive monitor
+	sub	r1, sp, #4			@ Get unused stack location
+	strex	r0, r1, [r1]			@ Clear the exclusive monitor
 #endif
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
-- 
cgit v0.10.2


From b57ee99fab25dbc12150fe66fe54dc52bc6de784 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 3 Mar 2009 11:44:12 +0100
Subject: [ARM] 5417/1: Set the correct cacheid for ARMv6 CPUs with ARMv7 style
 MMU

The cacheid_init() function assumes that if cpu_architecture() returns
7, the caches are VIPT_NONALIASING. The cpu_architecture() function
returns the version of the supported MMU features (e.g. TEX remapping)
but it doesn't make any assumptions about the cache type. The patch adds
the checking of the Cache Type Register for the ARMv7 format.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 7049815..68d6494 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -233,12 +233,13 @@ static void __init cacheid_init(void)
 	unsigned int cachetype = read_cpuid_cachetype();
 	unsigned int arch = cpu_architecture();
 
-	if (arch >= CPU_ARCH_ARMv7) {
-		cacheid = CACHEID_VIPT_NONALIASING;
-		if ((cachetype & (3 << 14)) == 1 << 14)
-			cacheid |= CACHEID_ASID_TAGGED;
-	} else if (arch >= CPU_ARCH_ARMv6) {
-		if (cachetype & (1 << 23))
+	if (arch >= CPU_ARCH_ARMv6) {
+		if ((cachetype & (7 << 29)) == 4 << 29) {
+			/* ARMv7 register format */
+			cacheid = CACHEID_VIPT_NONALIASING;
+			if ((cachetype & (3 << 14)) == 1 << 14)
+				cacheid |= CACHEID_ASID_TAGGED;
+		} else if (cachetype & (1 << 23))
 			cacheid = CACHEID_VIPT_ALIASING;
 		else
 			cacheid = CACHEID_VIPT_NONALIASING;
-- 
cgit v0.10.2


From bdf602bd737eb07d63d6fa2da826b4751fdf9bab Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Tue, 3 Mar 2009 13:43:47 +0000
Subject: [ARM] fix lots of ARM __devexit sillyness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`iop_adma_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o
`mv_xor_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o
`mv64xxx_i2c_unmap_regs' referenced in section `.devinit.text' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o
`mv64xxx_i2c_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o
`orion_nand_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o
`pxafb_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o

Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index ea5440d..647374a 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma");
 
 static struct platform_driver iop_adma_driver = {
 	.probe		= iop_adma_probe,
-	.remove		= iop_adma_remove,
+	.remove		= __devexit_p(iop_adma_remove),
 	.driver		= {
 		.owner	= THIS_MODULE,
 		.name	= "iop-adma",
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index d35cbd1..5d5d5b3 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp,
 
 static struct platform_driver mv_xor_driver = {
 	.probe		= mv_xor_probe,
-	.remove		= mv_xor_remove,
+	.remove		= __devexit_p(mv_xor_remove),
 	.driver		= {
 		.owner	= THIS_MODULE,
 		.name	= MV_XOR_NAME,
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c
index eeda276f..7f186bb 100644
--- a/drivers/i2c/busses/i2c-mv64xxx.c
+++ b/drivers/i2c/busses/i2c-mv64xxx.c
@@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd,
 	return 0;
 }
 
-static void __devexit
+static void
 mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data)
 {
 	if (drv_data->reg_base) {
@@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev)
 
 static struct platform_driver mv64xxx_i2c_driver = {
 	.probe	= mv64xxx_i2c_probe,
-	.remove	= mv64xxx_i2c_remove,
+	.remove	= __devexit_p(mv64xxx_i2c_remove),
 	.driver	= {
 		.owner	= THIS_MODULE,
 		.name	= MV64XXX_I2C_CTLR_NAME,
diff --git a/drivers/mtd/nand/orion_nand.c b/drivers/mtd/nand/orion_nand.c
index 917cf8d..c2dfd3e 100644
--- a/drivers/mtd/nand/orion_nand.c
+++ b/drivers/mtd/nand/orion_nand.c
@@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev)
 
 static struct platform_driver orion_nand_driver = {
 	.probe		= orion_nand_probe,
-	.remove		= orion_nand_remove,
+	.remove		= __devexit_p(orion_nand_remove),
 	.driver		= {
 		.name	= "orion_nand",
 		.owner	= THIS_MODULE,
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 48ff701..2552b9f 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev)
 
 static struct platform_driver pxafb_driver = {
 	.probe		= pxafb_probe,
-	.remove 	= pxafb_remove,
+	.remove 	= __devexit_p(pxafb_remove),
 	.suspend	= pxafb_suspend,
 	.resume		= pxafb_resume,
 	.driver		= {
-- 
cgit v0.10.2


From 1777f1a978153e8b887c1e1eb5160ac46098b142 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Ha=C5=82asa?= <khc@pm.waw.pl>
Date: Wed, 4 Mar 2009 08:01:22 +0800
Subject: crypto: ixp4xx - Fix qmgr_request_queue build failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is another user of IXP4xx queue manager, fix it.

Signed-off-by: Krzysztof Hałasa <khc@pm.waw.pl>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 2d637e0..d9e751b 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -457,10 +457,12 @@ static int init_ixp_crypto(void)
 	if (!ctx_pool) {
 		goto err;
 	}
-	ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0);
+	ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0,
+				 "ixp_crypto:out", NULL);
 	if (ret)
 		goto err;
-	ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0);
+	ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0,
+				 "ixp_crypto:in", NULL);
 	if (ret) {
 		qmgr_release_queue(SEND_QID);
 		goto err;
-- 
cgit v0.10.2


From fec6c6fec3e20637bee5d276fb61dd8b49a3f9cc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 3 Mar 2009 17:05:22 -0800
Subject: Linux 2.6.29-rc7


diff --git a/Makefile b/Makefile
index df6ce3e..d04ee0a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 29
-EXTRAVERSION = -rc6
+EXTRAVERSION = -rc7
 NAME = Erotic Pickled Herring
 
 # *DOCUMENTATION*
-- 
cgit v0.10.2


From 4e8304758cc09a6097dbd2c4f44a5369e5c1edb0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 3 Mar 2009 11:51:39 -0800
Subject: x86: remove vestigial fix_ioremap prototypes

The function seems to have disappeared at some point, leaving
some vestigial prototypes behind...

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4..e5383e3 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 
 extern void iounmap(volatile void __iomem *addr);
 
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
-
 
 #ifdef CONFIG_X86_32
 # include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
 extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
 extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 #define IO_SPACE_LIMIT 0xffff
 
-- 
cgit v0.10.2


From f254f3909efaf59ca2d0f408de2d044dace60706 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 3 Mar 2009 12:02:57 -0800
Subject: x86: un-__init fill_pud/pmd/pte

They are used by __set_fixmap->set_pte_vaddr_pud, which can
be used by arch_setup_additional_pages(), and so is used
after init.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 11981fc..07f44d4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -168,7 +168,7 @@ static __ref void *spp_getpage(void)
 	return ptr;
 }
 
-static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
 {
 	if (pgd_none(*pgd)) {
 		pud_t *pud = (pud_t *)spp_getpage();
@@ -180,7 +180,7 @@ static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
 	return pud_offset(pgd, vaddr);
 }
 
-static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
 {
 	if (pud_none(*pud)) {
 		pmd_t *pmd = (pmd_t *) spp_getpage();
@@ -192,7 +192,7 @@ static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
 	return pmd_offset(pud, vaddr);
 }
 
-static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
 {
 	if (pmd_none(*pmd)) {
 		pte_t *pte = (pte_t *) spp_getpage();
-- 
cgit v0.10.2


From 9bd50df6aa9bdd583793a16b0b9379dfd78c079e Mon Sep 17 00:00:00 2001
From: Philippe Gerum <rpm@xenomai.org>
Date: Wed, 4 Mar 2009 16:52:38 +0800
Subject: Blackfin arch:  Update adeos blackfin arch patch to 1.9-00

Signed-off-by: Philippe Gerum <rpm@xenomai.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/include/asm/ipipe.h b/arch/blackfin/include/asm/ipipe.h
index 76f53d8..343b563 100644
--- a/arch/blackfin/include/asm/ipipe.h
+++ b/arch/blackfin/include/asm/ipipe.h
@@ -35,9 +35,9 @@
 #include <asm/atomic.h>
 #include <asm/traps.h>
 
-#define IPIPE_ARCH_STRING     "1.8-00"
+#define IPIPE_ARCH_STRING     "1.9-00"
 #define IPIPE_MAJOR_NUMBER    1
-#define IPIPE_MINOR_NUMBER    8
+#define IPIPE_MINOR_NUMBER    9
 #define IPIPE_PATCH_NUMBER    0
 
 #ifdef CONFIG_SMP
@@ -83,9 +83,9 @@ struct ipipe_sysinfo {
 				"%2 = CYCLES2\n"		\
 				"CC = %2 == %0\n"		\
 				"if ! CC jump 1b\n"		\
-				: "=r" (((unsigned long *)&t)[1]),	\
-				  "=r" (((unsigned long *)&t)[0]),	\
-				  "=r" (__cy2)				\
+				: "=d,a" (((unsigned long *)&t)[1]),	\
+				  "=d,a" (((unsigned long *)&t)[0]),	\
+				  "=d,a" (__cy2)				\
 				: /*no input*/ : "CC");			\
 	t;								\
 	})
@@ -118,35 +118,40 @@ void __ipipe_disable_irqdesc(struct ipipe_domain *ipd,
 
 #define __ipipe_disable_irq(irq)	(irq_desc[irq].chip->mask(irq))
 
-#define __ipipe_lock_root()					\
-	set_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags)
+static inline int __ipipe_check_tickdev(const char *devname)
+{
+	return 1;
+}
 
-#define __ipipe_unlock_root()					\
-	clear_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags)
+static inline void __ipipe_lock_root(void)
+{
+	set_bit(IPIPE_SYNCDEFER_FLAG, &ipipe_root_cpudom_var(status));
+}
+
+static inline void __ipipe_unlock_root(void)
+{
+	clear_bit(IPIPE_SYNCDEFER_FLAG, &ipipe_root_cpudom_var(status));
+}
 
 void __ipipe_enable_pipeline(void);
 
 #define __ipipe_hook_critical_ipi(ipd) do { } while (0)
 
-#define __ipipe_sync_pipeline(syncmask)					\
-	do {								\
-		struct ipipe_domain *ipd = ipipe_current_domain;	\
-		if (likely(ipd != ipipe_root_domain || !test_bit(IPIPE_ROOTLOCK_FLAG, &ipd->flags))) \
-			__ipipe_sync_stage(syncmask);			\
-	} while (0)
+#define __ipipe_sync_pipeline  ___ipipe_sync_pipeline
+void ___ipipe_sync_pipeline(unsigned long syncmask);
 
 void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs);
 
 int __ipipe_get_irq_priority(unsigned irq);
 
-int __ipipe_get_irqthread_priority(unsigned irq);
-
 void __ipipe_stall_root_raw(void);
 
 void __ipipe_unstall_root_raw(void);
 
 void __ipipe_serial_debug(const char *fmt, ...);
 
+asmlinkage void __ipipe_call_irqtail(unsigned long addr);
+
 DECLARE_PER_CPU(struct pt_regs, __ipipe_tick_regs);
 
 extern unsigned long __ipipe_core_clock;
@@ -162,42 +167,25 @@ static inline unsigned long __ipipe_ffnz(unsigned long ul)
 
 #define __ipipe_run_irqtail()  /* Must be a macro */			\
 	do {								\
-		asmlinkage void __ipipe_call_irqtail(void);		\
 		unsigned long __pending;				\
-		CSYNC();					\
+		CSYNC();						\
 		__pending = bfin_read_IPEND();				\
 		if (__pending & 0x8000) {				\
 			__pending &= ~0x8010;				\
 			if (__pending && (__pending & (__pending - 1)) == 0) \
-				__ipipe_call_irqtail();			\
+				__ipipe_call_irqtail(__ipipe_irq_tail_hook); \
 		}							\
 	} while (0)
 
 #define __ipipe_run_isr(ipd, irq)					\
 	do {								\
 		if (ipd == ipipe_root_domain) {				\
-			/*						\
-			 * Note: the I-pipe implements a threaded interrupt model on \
-			 * this arch for Linux external IRQs. The interrupt handler we \
-			 * call here only wakes up the associated IRQ thread. \
-			 */						\
-			if (ipipe_virtual_irq_p(irq)) {			\
-				/* No irqtail here; virtual interrupts have no effect \
-				   on IPEND so there is no need for processing \
-				   deferral. */				\
-				local_irq_enable_nohead(ipd);		\
+			local_irq_enable_hw();				\
+			if (ipipe_virtual_irq_p(irq))			\
 				ipd->irqs[irq].handler(irq, ipd->irqs[irq].cookie); \
-				local_irq_disable_nohead(ipd);		\
-			} else						\
-				/*					\
-				 * No need to run the irqtail here either; \
-				 * we can't be preempted by hw IRQs, so	\
-				 * non-Linux IRQs cannot stack over the short \
-				 * thread wakeup code. Which in turn means \
-				 * that no irqtail condition could be pending \
-				 * for domains above Linux in the pipeline. \
-				 */					\
+			else						\
 				ipd->irqs[irq].handler(irq, &__raw_get_cpu_var(__ipipe_tick_regs)); \
+			local_irq_disable_hw();				\
 		} else {						\
 			__clear_bit(IPIPE_SYNC_FLAG, &ipipe_cpudom_var(ipd, status)); \
 			local_irq_enable_nohead(ipd);			\
@@ -217,42 +205,24 @@ void ipipe_init_irq_threads(void);
 
 int ipipe_start_irq_thread(unsigned irq, struct irq_desc *desc);
 
-#define IS_SYSIRQ(irq)		((irq) > IRQ_CORETMR && (irq) <= SYS_IRQS)
-#define IS_GPIOIRQ(irq)		((irq) >= GPIO_IRQ_BASE && (irq) < NR_IRQS)
-
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+#define IRQ_SYSTMR		IRQ_CORETMR
+#define IRQ_PRIOTMR		IRQ_CORETMR
+#else
 #define IRQ_SYSTMR		IRQ_TIMER0
 #define IRQ_PRIOTMR		CONFIG_IRQ_TIMER0
+#endif
 
-#if defined(CONFIG_BF531) || defined(CONFIG_BF532) || defined(CONFIG_BF533)
-#define PRIO_GPIODEMUX(irq)	CONFIG_PFA
-#elif defined(CONFIG_BF534) || defined(CONFIG_BF536) || defined(CONFIG_BF537)
-#define PRIO_GPIODEMUX(irq)	CONFIG_IRQ_PROG_INTA
-#elif defined(CONFIG_BF52x)
-#define PRIO_GPIODEMUX(irq)	((irq) == IRQ_PORTF_INTA ? CONFIG_IRQ_PORTF_INTA : \
-				 (irq) == IRQ_PORTG_INTA ? CONFIG_IRQ_PORTG_INTA : \
-				 (irq) == IRQ_PORTH_INTA ? CONFIG_IRQ_PORTH_INTA : \
-				 -1)
-#elif defined(CONFIG_BF561)
-#define PRIO_GPIODEMUX(irq)	((irq) == IRQ_PROG0_INTA ? CONFIG_IRQ_PROG0_INTA : \
-				 (irq) == IRQ_PROG1_INTA ? CONFIG_IRQ_PROG1_INTA : \
-				 (irq) == IRQ_PROG2_INTA ? CONFIG_IRQ_PROG2_INTA : \
-				 -1)
+#ifdef CONFIG_BF561
 #define bfin_write_TIMER_DISABLE(val)	bfin_write_TMRS8_DISABLE(val)
 #define bfin_write_TIMER_ENABLE(val)	bfin_write_TMRS8_ENABLE(val)
 #define bfin_write_TIMER_STATUS(val)	bfin_write_TMRS8_STATUS(val)
 #define bfin_read_TIMER_STATUS()	bfin_read_TMRS8_STATUS()
 #elif defined(CONFIG_BF54x)
-#define PRIO_GPIODEMUX(irq)	((irq) == IRQ_PINT0 ? CONFIG_IRQ_PINT0 : \
-				 (irq) == IRQ_PINT1 ? CONFIG_IRQ_PINT1 : \
-				 (irq) == IRQ_PINT2 ? CONFIG_IRQ_PINT2 : \
-				 (irq) == IRQ_PINT3 ? CONFIG_IRQ_PINT3 : \
-				 -1)
 #define bfin_write_TIMER_DISABLE(val)	bfin_write_TIMER_DISABLE0(val)
 #define bfin_write_TIMER_ENABLE(val)	bfin_write_TIMER_ENABLE0(val)
 #define bfin_write_TIMER_STATUS(val)	bfin_write_TIMER_STATUS0(val)
 #define bfin_read_TIMER_STATUS(val)	bfin_read_TIMER_STATUS0(val)
-#else
-# error "no PRIO_GPIODEMUX() for this part"
 #endif
 
 #define __ipipe_root_tick_p(regs)	((regs->ipend & 0x10) != 0)
@@ -275,4 +245,6 @@ int ipipe_start_irq_thread(unsigned irq, struct irq_desc *desc);
 
 #endif /* !CONFIG_IPIPE */
 
+#define ipipe_update_tick_evtdev(evtdev)	do { } while (0)
+
 #endif	/* !__ASM_BLACKFIN_IPIPE_H */
diff --git a/arch/blackfin/include/asm/ipipe_base.h b/arch/blackfin/include/asm/ipipe_base.h
index cb1025a..3e8acbd 100644
--- a/arch/blackfin/include/asm/ipipe_base.h
+++ b/arch/blackfin/include/asm/ipipe_base.h
@@ -1,5 +1,5 @@
 /*   -*- linux-c -*-
- *   include/asm-blackfin/_baseipipe.h
+ *   include/asm-blackfin/ipipe_base.h
  *
  *   Copyright (C) 2007 Philippe Gerum.
  *
@@ -27,8 +27,9 @@
 #define IPIPE_NR_XIRQS		NR_IRQS
 #define IPIPE_IRQ_ISHIFT	5	/* 2^5 for 32bits arch. */
 
-/* Blackfin-specific, global domain flags */
-#define IPIPE_ROOTLOCK_FLAG	1	/* Lock pipeline for root */
+/* Blackfin-specific, per-cpu pipeline status */
+#define IPIPE_SYNCDEFER_FLAG	15
+#define IPIPE_SYNCDEFER_MASK	(1L << IPIPE_SYNCDEFER_MASK)
 
  /* Blackfin traps -- i.e. exception vector numbers */
 #define IPIPE_NR_FAULTS		52 /* We leave a gap after VEC_ILL_RES. */
@@ -48,11 +49,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/bitops.h>
-
-extern int test_bit(int nr, const void *addr);
-
-
 extern unsigned long __ipipe_root_status; /* Alias to ipipe_root_cpudom_var(status) */
 
 static inline void __ipipe_stall_root(void)
diff --git a/arch/blackfin/include/asm/irq.h b/arch/blackfin/include/asm/irq.h
index 3d97790..7645e85 100644
--- a/arch/blackfin/include/asm/irq.h
+++ b/arch/blackfin/include/asm/irq.h
@@ -61,20 +61,38 @@ void __ipipe_restore_root(unsigned long flags);
 #define raw_irqs_disabled_flags(flags)	(!irqs_enabled_from_flags_hw(flags))
 #define local_test_iflag_hw(x)		irqs_enabled_from_flags_hw(x)
 
-#define local_save_flags(x)						\
-	do {								\
-		(x) = __ipipe_test_root() ? \
+#define local_save_flags(x)					 \
+	do {							 \
+		(x) = __ipipe_test_root() ?			 \
 			__all_masked_irq_flags : bfin_irq_flags; \
+		barrier();					 \
 	} while (0)
 
-#define local_irq_save(x)				\
-	do {						\
-		(x) = __ipipe_test_and_stall_root();	\
+#define local_irq_save(x)					 \
+	do {						 	 \
+		(x) = __ipipe_test_and_stall_root() ?		 \
+			__all_masked_irq_flags : bfin_irq_flags; \
+		barrier();					 \
+	} while (0)
+
+static inline void local_irq_restore(unsigned long x)
+{
+	barrier();
+	__ipipe_restore_root(x == __all_masked_irq_flags);
+}
+
+#define local_irq_disable()			\
+	do {					\
+		__ipipe_stall_root();		\
+		barrier();			\
 	} while (0)
 
-#define local_irq_restore(x)	__ipipe_restore_root(x)
-#define local_irq_disable()	__ipipe_stall_root()
-#define local_irq_enable()	__ipipe_unstall_root()
+static inline void local_irq_enable(void)
+{
+	barrier();
+	__ipipe_unstall_root();
+}
+
 #define irqs_disabled()		__ipipe_test_root()
 
 #define local_save_flags_hw(x) \
diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h
index e721ce5..2920087 100644
--- a/arch/blackfin/include/asm/thread_info.h
+++ b/arch/blackfin/include/asm/thread_info.h
@@ -122,6 +122,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE              4
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_FREEZE              6       /* is freezing for suspend */
+#define TIF_IRQ_SYNC            7       /* sync pipeline stage */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -130,6 +131,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_FREEZE             (1<<TIF_FREEZE)
+#define _TIF_IRQ_SYNC           (1<<TIF_IRQ_SYNC)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 
diff --git a/arch/blackfin/kernel/ipipe.c b/arch/blackfin/kernel/ipipe.c
index 339be5a..a5de8d4 100644
--- a/arch/blackfin/kernel/ipipe.c
+++ b/arch/blackfin/kernel/ipipe.c
@@ -35,14 +35,8 @@
 #include <asm/atomic.h>
 #include <asm/io.h>
 
-static int create_irq_threads;
-
 DEFINE_PER_CPU(struct pt_regs, __ipipe_tick_regs);
 
-static DEFINE_PER_CPU(unsigned long, pending_irqthread_mask);
-
-static DEFINE_PER_CPU(int [IVG13 + 1], pending_irq_count);
-
 asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs);
 
 static void __ipipe_no_irqtail(void);
@@ -93,6 +87,7 @@ void __ipipe_enable_pipeline(void)
  */
 void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs)
 {
+	struct ipipe_percpu_domain_data *p = ipipe_root_cpudom_ptr();
 	struct ipipe_domain *this_domain, *next_domain;
 	struct list_head *head, *pos;
 	int m_ack, s = -1;
@@ -104,7 +99,6 @@ void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs)
 	 * interrupt.
 	 */
 	m_ack = (regs == NULL || irq == IRQ_SYSTMR || irq == IRQ_CORETMR);
-
 	this_domain = ipipe_current_domain;
 
 	if (unlikely(test_bit(IPIPE_STICKY_FLAG, &this_domain->irqs[irq].control)))
@@ -114,49 +108,28 @@ void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs)
 		next_domain = list_entry(head, struct ipipe_domain, p_link);
 		if (likely(test_bit(IPIPE_WIRED_FLAG, &next_domain->irqs[irq].control))) {
 			if (!m_ack && next_domain->irqs[irq].acknowledge != NULL)
-				next_domain->irqs[irq].acknowledge(irq, irq_desc + irq);
-			if (test_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags))
-				s = __test_and_set_bit(IPIPE_STALL_FLAG,
-						       &ipipe_root_cpudom_var(status));
+				next_domain->irqs[irq].acknowledge(irq, irq_to_desc(irq));
+			if (test_bit(IPIPE_SYNCDEFER_FLAG, &p->status))
+				s = __test_and_set_bit(IPIPE_STALL_FLAG, &p->status);
 			__ipipe_dispatch_wired(next_domain, irq);
-				goto finalize;
-			return;
+			goto out;
 		}
 	}
 
 	/* Ack the interrupt. */
 
 	pos = head;
-
 	while (pos != &__ipipe_pipeline) {
 		next_domain = list_entry(pos, struct ipipe_domain, p_link);
-		/*
-		 * For each domain handling the incoming IRQ, mark it
-		 * as pending in its log.
-		 */
 		if (test_bit(IPIPE_HANDLE_FLAG, &next_domain->irqs[irq].control)) {
-			/*
-			 * Domains that handle this IRQ are polled for
-			 * acknowledging it by decreasing priority
-			 * order. The interrupt must be made pending
-			 * _first_ in the domain's status flags before
-			 * the PIC is unlocked.
-			 */
 			__ipipe_set_irq_pending(next_domain, irq);
-
 			if (!m_ack && next_domain->irqs[irq].acknowledge != NULL) {
-				next_domain->irqs[irq].acknowledge(irq, irq_desc + irq);
+				next_domain->irqs[irq].acknowledge(irq, irq_to_desc(irq));
 				m_ack = 1;
 			}
 		}
-
-		/*
-		 * If the domain does not want the IRQ to be passed
-		 * down the interrupt pipe, exit the loop now.
-		 */
 		if (!test_bit(IPIPE_PASS_FLAG, &next_domain->irqs[irq].control))
 			break;
-
 		pos = next_domain->p_link.next;
 	}
 
@@ -166,18 +139,24 @@ void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs)
 	 * immediately to the current domain if the interrupt has been
 	 * marked as 'sticky'. This search does not go beyond the
 	 * current domain in the pipeline. We also enforce the
-	 * additional root stage lock (blackfin-specific). */
+	 * additional root stage lock (blackfin-specific).
+	 */
+	if (test_bit(IPIPE_SYNCDEFER_FLAG, &p->status))
+		s = __test_and_set_bit(IPIPE_STALL_FLAG, &p->status);
 
-	if (test_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags))
-		s = __test_and_set_bit(IPIPE_STALL_FLAG,
-				       &ipipe_root_cpudom_var(status));
-finalize:
+	/*
+	 * If the interrupt preempted the head domain, then do not
+	 * even try to walk the pipeline, unless an interrupt is
+	 * pending for it.
+	 */
+	if (test_bit(IPIPE_AHEAD_FLAG, &this_domain->flags) &&
+	    ipipe_head_cpudom_var(irqpend_himask) == 0)
+		goto out;
 
 	__ipipe_walk_pipeline(head);
-
+out:
 	if (!s)
-		__clear_bit(IPIPE_STALL_FLAG,
-			    &ipipe_root_cpudom_var(status));
+		__clear_bit(IPIPE_STALL_FLAG, &p->status);
 }
 
 int __ipipe_check_root(void)
@@ -187,7 +166,7 @@ int __ipipe_check_root(void)
 
 void __ipipe_enable_irqdesc(struct ipipe_domain *ipd, unsigned irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	int prio = desc->ic_prio;
 
 	desc->depth = 0;
@@ -199,7 +178,7 @@ EXPORT_SYMBOL(__ipipe_enable_irqdesc);
 
 void __ipipe_disable_irqdesc(struct ipipe_domain *ipd, unsigned irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	int prio = desc->ic_prio;
 
 	if (ipd != &ipipe_root &&
@@ -236,15 +215,18 @@ int __ipipe_syscall_root(struct pt_regs *regs)
 {
 	unsigned long flags;
 
-	/* We need to run the IRQ tail hook whenever we don't
+	/*
+	 * We need to run the IRQ tail hook whenever we don't
 	 * propagate a syscall to higher domains, because we know that
 	 * important operations might be pending there (e.g. Xenomai
-	 * deferred rescheduling). */
+	 * deferred rescheduling).
+	 */
 
-	if (!__ipipe_syscall_watched_p(current, regs->orig_p0)) {
+	if (regs->orig_p0 < NR_syscalls) {
 		void (*hook)(void) = (void (*)(void))__ipipe_irq_tail_hook;
 		hook();
-		return 0;
+		if ((current->flags & PF_EVNOTIFY) == 0)
+			return 0;
 	}
 
 	/*
@@ -312,112 +294,46 @@ int ipipe_trigger_irq(unsigned irq)
 {
 	unsigned long flags;
 
+#ifdef CONFIG_IPIPE_DEBUG
 	if (irq >= IPIPE_NR_IRQS ||
 	    (ipipe_virtual_irq_p(irq)
 	     && !test_bit(irq - IPIPE_VIRQ_BASE, &__ipipe_virtual_irq_map)))
 		return -EINVAL;
+#endif
 
 	local_irq_save_hw(flags);
-
 	__ipipe_handle_irq(irq, NULL);
-
 	local_irq_restore_hw(flags);
 
 	return 1;
 }
 
-/* Move Linux IRQ to threads. */
-
-static int do_irqd(void *__desc)
+asmlinkage void __ipipe_sync_root(void)
 {
-	struct irq_desc *desc = __desc;
-	unsigned irq = desc - irq_desc;
-	int thrprio = desc->thr_prio;
-	int thrmask = 1 << thrprio;
-	int cpu = smp_processor_id();
-	cpumask_t cpumask;
-
-	sigfillset(&current->blocked);
-	current->flags |= PF_NOFREEZE;
-	cpumask = cpumask_of_cpu(cpu);
-	set_cpus_allowed(current, cpumask);
-	ipipe_setscheduler_root(current, SCHED_FIFO, 50 + thrprio);
-
-	while (!kthread_should_stop()) {
-		local_irq_disable();
-		if (!(desc->status & IRQ_SCHEDULED)) {
-			set_current_state(TASK_INTERRUPTIBLE);
-resched:
-			local_irq_enable();
-			schedule();
-			local_irq_disable();
-		}
-		__set_current_state(TASK_RUNNING);
-		/*
-		 * If higher priority interrupt servers are ready to
-		 * run, reschedule immediately. We need this for the
-		 * GPIO demux IRQ handler to unmask the interrupt line
-		 * _last_, after all GPIO IRQs have run.
-		 */
-		if (per_cpu(pending_irqthread_mask, cpu) & ~(thrmask|(thrmask-1)))
-			goto resched;
-		if (--per_cpu(pending_irq_count[thrprio], cpu) == 0)
-			per_cpu(pending_irqthread_mask, cpu) &= ~thrmask;
-		desc->status &= ~IRQ_SCHEDULED;
-		desc->thr_handler(irq, &__raw_get_cpu_var(__ipipe_tick_regs));
-		local_irq_enable();
-	}
-	__set_current_state(TASK_RUNNING);
-	return 0;
-}
+	unsigned long flags;
 
-static void kick_irqd(unsigned irq, void *cookie)
-{
-	struct irq_desc *desc = irq_desc + irq;
-	int thrprio = desc->thr_prio;
-	int thrmask = 1 << thrprio;
-	int cpu = smp_processor_id();
-
-	if (!(desc->status & IRQ_SCHEDULED)) {
-		desc->status |= IRQ_SCHEDULED;
-		per_cpu(pending_irqthread_mask, cpu) |= thrmask;
-		++per_cpu(pending_irq_count[thrprio], cpu);
-		wake_up_process(desc->thread);
-	}
-}
+	BUG_ON(irqs_disabled());
 
-int ipipe_start_irq_thread(unsigned irq, struct irq_desc *desc)
-{
-	if (desc->thread || !create_irq_threads)
-		return 0;
-
-	desc->thread = kthread_create(do_irqd, desc, "IRQ %d", irq);
-	if (desc->thread == NULL) {
-		printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq);
-		return -ENOMEM;
-	}
+	local_irq_save_hw(flags);
 
-	wake_up_process(desc->thread);
+	clear_thread_flag(TIF_IRQ_SYNC);
 
-	desc->thr_handler = ipipe_root_domain->irqs[irq].handler;
-	ipipe_root_domain->irqs[irq].handler = &kick_irqd;
+	if (ipipe_root_cpudom_var(irqpend_himask) != 0)
+		__ipipe_sync_pipeline(IPIPE_IRQMASK_ANY);
 
-	return 0;
+	local_irq_restore_hw(flags);
 }
 
-void __init ipipe_init_irq_threads(void)
+void ___ipipe_sync_pipeline(unsigned long syncmask)
 {
-	unsigned irq;
-	struct irq_desc *desc;
-
-	create_irq_threads = 1;
+	struct ipipe_domain *ipd = ipipe_current_domain;
 
-	for (irq = 0; irq < NR_IRQS; irq++) {
-		desc = irq_desc + irq;
-		if (desc->action != NULL ||
-			(desc->status & IRQ_NOREQUEST) != 0)
-			ipipe_start_irq_thread(irq, desc);
+	if (ipd == ipipe_root_domain) {
+		if (test_bit(IPIPE_SYNCDEFER_FLAG, &ipipe_root_cpudom_var(status)))
+			return;
 	}
+
+	__ipipe_sync_stage(syncmask);
 }
 
 EXPORT_SYMBOL(show_stack);
diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c
index 75724ee..7fd1265 100644
--- a/arch/blackfin/kernel/irqchip.c
+++ b/arch/blackfin/kernel/irqchip.c
@@ -144,11 +144,15 @@ asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
 #endif
 	generic_handle_irq(irq);
 
-#ifndef CONFIG_IPIPE	/* Useless and bugous over the I-pipe: IRQs are threaded. */
-	/* If we're the only interrupt running (ignoring IRQ15 which is for
-	   syscalls), lower our priority to IRQ14 so that softirqs run at
-	   that level.  If there's another, lower-level interrupt, irq_exit
-	   will defer softirqs to that.  */
+#ifndef CONFIG_IPIPE
+	/*
+	 * If we're the only interrupt running (ignoring IRQ15 which
+	 * is for syscalls), lower our priority to IRQ14 so that
+	 * softirqs run at that level.  If there's another,
+	 * lower-level interrupt, irq_exit will defer softirqs to
+	 * that. If the interrupt pipeline is enabled, we are already
+	 * running at IRQ14 priority, so we don't need this code.
+	 */
 	CSYNC();
 	pending = bfin_read_IPEND() & ~0x8000;
 	other_ints = pending & (pending - 1);
diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c
index 172b4c58..1bbacfb 100644
--- a/arch/blackfin/kernel/time.c
+++ b/arch/blackfin/kernel/time.c
@@ -134,7 +134,10 @@ irqreturn_t timer_interrupt(int irq, void *dummy)
 
 	write_seqlock(&xtime_lock);
 #if defined(CONFIG_TICK_SOURCE_SYSTMR0) && !defined(CONFIG_IPIPE)
-/* FIXME: Here TIMIL0 is not set when IPIPE enabled, why? */
+	/*
+	 * TIMIL0 is latched in __ipipe_grab_irq() when the I-Pipe is
+	 * enabled.
+	 */
 	if (get_gptimer_status(0) & TIMER_STATUS_TIMIL0) {
 #endif
 		do_timer(1);
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 88de053..21e65a3 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -600,6 +600,19 @@ ENTRY(_system_call)
 	p2 = [p2];
 
 	[p2+(TASK_THREAD+THREAD_KSP)] = sp;
+#ifdef CONFIG_IPIPE
+	r0 = sp;
+	SP += -12;
+	call ___ipipe_syscall_root;
+	SP += 12;
+	cc = r0 == 1;
+	if cc jump .Lsyscall_really_exit;
+	cc = r0 == -1;
+	if cc jump .Lresume_userspace;
+	r3 = [sp + PT_R3];
+	r4 = [sp + PT_R4];
+	p0 = [sp + PT_ORIG_P0];
+#endif /* CONFIG_IPIPE */
 
 	/* Check the System Call */
 	r7 = __NR_syscall;
@@ -654,6 +667,17 @@ ENTRY(_system_call)
 	r7 =  r7 & r4;
 
 .Lsyscall_resched:
+#ifdef CONFIG_IPIPE
+	cc = BITTST(r7, TIF_IRQ_SYNC);
+	if !cc jump .Lsyscall_no_irqsync;
+	[--sp] = reti;
+	r0 = [sp++];
+	SP += -12;
+	call ___ipipe_sync_root;
+	SP += 12;
+	jump .Lresume_userspace_1;
+.Lsyscall_no_irqsync:
+#endif
 	cc = BITTST(r7, TIF_NEED_RESCHED);
 	if !cc jump .Lsyscall_sigpending;
 
@@ -685,6 +709,10 @@ ENTRY(_system_call)
 .Lsyscall_really_exit:
 	r5 = [sp + PT_RESERVED];
 	rets = r5;
+#ifdef CONFIG_IPIPE
+	[--sp] = reti;
+	r5 = [sp++];
+#endif /* CONFIG_IPIPE */
 	rts;
 ENDPROC(_system_call)
 
@@ -771,6 +799,15 @@ _new_old_task:
 ENDPROC(_resume)
 
 ENTRY(_ret_from_exception)
+#ifdef CONFIG_IPIPE
+	[--sp] = rets;
+	SP += -12;
+	call ___ipipe_check_root
+	SP += 12
+	rets = [sp++];
+	cc = r0 == 0;
+	if cc jump 4f;                /* not on behalf of Linux, get out */
+#endif /* CONFIG_IPIPE */
 	p2.l = lo(IPEND);
 	p2.h = hi(IPEND);
 
@@ -827,6 +864,28 @@ ENTRY(_ret_from_exception)
 	rts;
 ENDPROC(_ret_from_exception)
 
+#ifdef CONFIG_IPIPE
+
+_sync_root_irqs:
+	[--sp] = reti;		/* Reenable interrupts */
+	r0 = [sp++];
+	jump.l ___ipipe_sync_root
+
+_resume_kernel_from_int:
+	r0.l = _sync_root_irqs
+	r0.h = _sync_root_irqs
+	[--sp] = rets;
+	[--sp] = ( r7:4, p5:3 );
+	SP += -12;
+	call ___ipipe_call_irqtail
+	SP += 12;
+	( r7:4, p5:3 ) = [sp++];
+	rets = [sp++];
+	rts
+#else
+#define _resume_kernel_from_int	 2f
+#endif
+
 ENTRY(_return_from_int)
 	/* If someone else already raised IRQ 15, do nothing.  */
 	csync;
@@ -848,7 +907,7 @@ ENTRY(_return_from_int)
 	r1 = r0 - r1;
 	r2 = r0 & r1;
 	cc = r2 == 0;
-	if !cc jump 2f;
+	if !cc jump _resume_kernel_from_int;
 
 	/* Lower the interrupt level to 15.  */
 	p0.l = lo(EVT15);
diff --git a/arch/blackfin/mach-common/interrupt.S b/arch/blackfin/mach-common/interrupt.S
index 43c4eb9..0069c2d 100644
--- a/arch/blackfin/mach-common/interrupt.S
+++ b/arch/blackfin/mach-common/interrupt.S
@@ -235,6 +235,7 @@ ENDPROC(_evt_system_call)
 
 #ifdef CONFIG_IPIPE
 ENTRY(___ipipe_call_irqtail)
+	p0 = r0;
 	r0.l = 1f;
 	r0.h = 1f;
 	reti = r0;
@@ -242,9 +243,6 @@ ENTRY(___ipipe_call_irqtail)
 1:
 	[--sp] = rets;
 	[--sp] = ( r7:4, p5:3 );
-	p0.l = ___ipipe_irq_tail_hook;
-	p0.h = ___ipipe_irq_tail_hook;
-	p0 = [p0];
 	sp += -12;
 	call (p0);
 	sp += 12;
@@ -259,7 +257,7 @@ ENTRY(___ipipe_call_irqtail)
 	p0.h = hi(EVT14);
 	[p0] = r0;
 	csync;
-	r0 = 0x401f;
+	r0 = 0x401f (z);
 	sti r0;
 	raise 14;
 	[--sp] = reti;          /* IRQs on. */
@@ -277,11 +275,7 @@ ENTRY(___ipipe_call_irqtail)
 	p0.h = _bfin_irq_flags;
 	r0 = [p0];
 	sti r0;
-#if 0 /* FIXME: this actually raises scheduling latencies */
-	/* Reenable interrupts */
-	[--sp] = reti;
-	r0 = [sp++];
-#endif
 	rts;
 ENDPROC(___ipipe_call_irqtail)
+
 #endif /* CONFIG_IPIPE */
diff --git a/arch/blackfin/mach-common/ints-priority.c b/arch/blackfin/mach-common/ints-priority.c
index 2024945..a7d7b2d 100644
--- a/arch/blackfin/mach-common/ints-priority.c
+++ b/arch/blackfin/mach-common/ints-priority.c
@@ -161,11 +161,15 @@ static void bfin_core_unmask_irq(unsigned int irq)
 
 static void bfin_internal_mask_irq(unsigned int irq)
 {
+	unsigned long flags;
+
 #ifdef CONFIG_BF53x
+	local_irq_save_hw(flags);
 	bfin_write_SIC_IMASK(bfin_read_SIC_IMASK() &
 			     ~(1 << SIC_SYSIRQ(irq)));
 #else
 	unsigned mask_bank, mask_bit;
+	local_irq_save_hw(flags);
 	mask_bank = SIC_SYSIRQ(irq) / 32;
 	mask_bit = SIC_SYSIRQ(irq) % 32;
 	bfin_write_SIC_IMASK(mask_bank, bfin_read_SIC_IMASK(mask_bank) &
@@ -175,15 +179,20 @@ static void bfin_internal_mask_irq(unsigned int irq)
 			     ~(1 << mask_bit));
 #endif
 #endif
+	local_irq_restore_hw(flags);
 }
 
 static void bfin_internal_unmask_irq(unsigned int irq)
 {
+	unsigned long flags;
+
 #ifdef CONFIG_BF53x
+	local_irq_save_hw(flags);
 	bfin_write_SIC_IMASK(bfin_read_SIC_IMASK() |
 			     (1 << SIC_SYSIRQ(irq)));
 #else
 	unsigned mask_bank, mask_bit;
+	local_irq_save_hw(flags);
 	mask_bank = SIC_SYSIRQ(irq) / 32;
 	mask_bit = SIC_SYSIRQ(irq) % 32;
 	bfin_write_SIC_IMASK(mask_bank, bfin_read_SIC_IMASK(mask_bank) |
@@ -193,6 +202,7 @@ static void bfin_internal_unmask_irq(unsigned int irq)
 			     (1 << mask_bit));
 #endif
 #endif
+	local_irq_restore_hw(flags);
 }
 
 #ifdef CONFIG_PM
@@ -390,7 +400,7 @@ static void bfin_demux_error_irq(unsigned int int_err_irq,
 static inline void bfin_set_irq_handler(unsigned irq, irq_flow_handler_t handle)
 {
 #ifdef CONFIG_IPIPE
-	_set_irq_handler(irq, handle_edge_irq);
+	_set_irq_handler(irq, handle_level_irq);
 #else
 	struct irq_desc *desc = irq_desc + irq;
 	/* May not call generic set_irq_handler() due to spinlock
@@ -1055,13 +1065,18 @@ int __init init_arch_irq(void)
 #endif
 		default:
 #ifdef CONFIG_IPIPE
-	/*
-	 * We want internal interrupt sources to be masked, because
-	 * ISRs may trigger interrupts recursively (e.g. DMA), but
-	 * interrupts are _not_ masked at CPU level. So let's handle
-	 * them as level interrupts.
-	 */
-			set_irq_handler(irq, handle_level_irq);
+			/*
+			 * We want internal interrupt sources to be
+			 * masked, because ISRs may trigger interrupts
+			 * recursively (e.g. DMA), but interrupts are
+			 * _not_ masked at CPU level. So let's handle
+			 * most of them as level interrupts, except
+			 * the timer interrupt which is special.
+			 */
+			if (irq == IRQ_SYSTMR || irq == IRQ_CORETMR)
+				set_irq_handler(irq, handle_simple_irq);
+			else
+				set_irq_handler(irq, handle_level_irq);
 #else /* !CONFIG_IPIPE */
 			set_irq_handler(irq, handle_simple_irq);
 #endif /* !CONFIG_IPIPE */
@@ -1123,9 +1138,8 @@ int __init init_arch_irq(void)
 
 #ifdef CONFIG_IPIPE
 	for (irq = 0; irq < NR_IRQS; irq++) {
-		struct irq_desc *desc = irq_desc + irq;
+		struct irq_desc *desc = irq_to_desc(irq);
 		desc->ic_prio = __ipipe_get_irq_priority(irq);
-		desc->thr_prio = __ipipe_get_irqthread_priority(irq);
 	}
 #endif /* CONFIG_IPIPE */
 
@@ -1208,76 +1222,21 @@ int __ipipe_get_irq_priority(unsigned irq)
 	return IVG15;
 }
 
-int __ipipe_get_irqthread_priority(unsigned irq)
-{
-	int ient, prio;
-	int demux_irq;
-
-	/* The returned priority value is rescaled to [0..IVG13+1]
-	 * with 0 being the lowest effective priority level. */
-
-	if (irq <= IRQ_CORETMR)
-		return IVG13 - irq + 1;
-
-	/* GPIO IRQs are given the priority of the demux
-	 * interrupt. */
-	if (IS_GPIOIRQ(irq)) {
-#if defined(CONFIG_BF54x)
-		u32 bank = PINT_2_BANK(irq2pint_lut[irq - SYS_IRQS]);
-		demux_irq = (bank == 0 ? IRQ_PINT0 :
-				bank == 1 ? IRQ_PINT1 :
-				bank == 2 ? IRQ_PINT2 :
-				IRQ_PINT3);
-#elif defined(CONFIG_BF561)
-		demux_irq = (irq >= IRQ_PF32 ? IRQ_PROG2_INTA :
-				irq >= IRQ_PF16 ? IRQ_PROG1_INTA :
-				IRQ_PROG0_INTA);
-#elif defined(CONFIG_BF52x)
-		demux_irq = (irq >= IRQ_PH0 ? IRQ_PORTH_INTA :
-				irq >= IRQ_PG0 ? IRQ_PORTG_INTA :
-				IRQ_PORTF_INTA);
-#else
-		demux_irq = irq;
-#endif
-		return IVG13 - PRIO_GPIODEMUX(demux_irq) + 1;
-	}
-
-	/* The GPIO demux interrupt is given a lower priority
-	 * than the GPIO IRQs, so that its threaded handler
-	 * unmasks the interrupt line after the decoded IRQs
-	 * have been processed. */
-	prio = PRIO_GPIODEMUX(irq);
-	/* demux irq? */
-	if (prio != -1)
-		return IVG13 - prio;
-
-	for (ient = 0; ient < NR_PERI_INTS; ient++) {
-		struct ivgx *ivg = ivg_table + ient;
-		if (ivg->irqno == irq) {
-			for (prio = 0; prio <= IVG13-IVG7; prio++) {
-				if (ivg7_13[prio].ifirst <= ivg &&
-				    ivg7_13[prio].istop > ivg)
-					return IVG7 - prio;
-			}
-		}
-	}
-
-	return 0;
-}
-
 /* Hw interrupts are disabled on entry (check SAVE_CONTEXT). */
 #ifdef CONFIG_DO_IRQ_L1
 __attribute__((l1_text))
 #endif
 asmlinkage int __ipipe_grab_irq(int vec, struct pt_regs *regs)
 {
+	struct ipipe_percpu_domain_data *p = ipipe_root_cpudom_ptr();
+	struct ipipe_domain *this_domain = ipipe_current_domain;
 	struct ivgx *ivg_stop = ivg7_13[vec-IVG7].istop;
 	struct ivgx *ivg = ivg7_13[vec-IVG7].ifirst;
-	int irq;
+	int irq, s;
 
 	if (likely(vec == EVT_IVTMR_P)) {
 		irq = IRQ_CORETMR;
-		goto handle_irq;
+		goto core_tick;
 	}
 
 	SSYNC();
@@ -1319,24 +1278,39 @@ asmlinkage int __ipipe_grab_irq(int vec, struct pt_regs *regs)
 	irq = ivg->irqno;
 
 	if (irq == IRQ_SYSTMR) {
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+core_tick:
+#else
 		bfin_write_TIMER_STATUS(1); /* Latch TIMIL0 */
+#endif
 		/* This is basically what we need from the register frame. */
 		__raw_get_cpu_var(__ipipe_tick_regs).ipend = regs->ipend;
 		__raw_get_cpu_var(__ipipe_tick_regs).pc = regs->pc;
-		if (!ipipe_root_domain_p)
-			__raw_get_cpu_var(__ipipe_tick_regs).ipend |= 0x10;
-		else
+		if (this_domain != ipipe_root_domain)
 			__raw_get_cpu_var(__ipipe_tick_regs).ipend &= ~0x10;
+		else
+			__raw_get_cpu_var(__ipipe_tick_regs).ipend |= 0x10;
 	}
 
-handle_irq:
+#ifndef CONFIG_GENERIC_CLOCKEVENTS
+core_tick:
+#endif
+	if (this_domain == ipipe_root_domain) {
+		s = __test_and_set_bit(IPIPE_SYNCDEFER_FLAG, &p->status);
+		barrier();
+	}
 
 	ipipe_trace_irq_entry(irq);
 	__ipipe_handle_irq(irq, regs);
-       ipipe_trace_irq_exit(irq);
+	ipipe_trace_irq_exit(irq);
 
-       if (ipipe_root_domain_p)
-		return !test_bit(IPIPE_STALL_FLAG, &ipipe_root_cpudom_var(status));
+	if (this_domain == ipipe_root_domain) {
+		set_thread_flag(TIF_IRQ_SYNC);
+		if (!s) {
+			__clear_bit(IPIPE_SYNCDEFER_FLAG, &p->status);
+			return !test_bit(IPIPE_STALL_FLAG, &p->status);
+		}
+	}
 
        return 0;
 }
-- 
cgit v0.10.2


From 8bf6774d7fe44ada94bb8df50f089c6e60bdfb26 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Wed, 4 Mar 2009 11:34:10 +0800
Subject: Blackfin arch: Enable Write Back Cache on all Blackfin Boards

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig
index 833128b..a50050f 100644
--- a/arch/blackfin/configs/BF527-EZKIT_defconfig
+++ b/arch/blackfin/configs/BF527-EZKIT_defconfig
@@ -327,8 +327,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_MPU is not set
 
 #
diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig
index 334c94b..0a2a00d 100644
--- a/arch/blackfin/configs/BF533-EZKIT_defconfig
+++ b/arch/blackfin/configs/BF533-EZKIT_defconfig
@@ -290,8 +290,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_MPU is not set
 
 #
diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig
index 9d73343..eb02758 100644
--- a/arch/blackfin/configs/BF533-STAMP_defconfig
+++ b/arch/blackfin/configs/BF533-STAMP_defconfig
@@ -290,8 +290,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_MPU is not set
 
 #
diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig
index c61b600..9e62b9f 100644
--- a/arch/blackfin/configs/BF537-STAMP_defconfig
+++ b/arch/blackfin/configs/BF537-STAMP_defconfig
@@ -298,8 +298,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_MPU is not set
 
 #
diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig
index cb32f56..dd6ad6be 100644
--- a/arch/blackfin/configs/BF538-EZKIT_defconfig
+++ b/arch/blackfin/configs/BF538-EZKIT_defconfig
@@ -306,8 +306,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_MPU is not set
 
 #
diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig
index 0f869761..b41dbd0 100644
--- a/arch/blackfin/configs/BF548-EZKIT_defconfig
+++ b/arch/blackfin/configs/BF548-EZKIT_defconfig
@@ -361,8 +361,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_BFIN_L2_CACHEABLE is not set
 # CONFIG_MPU is not set
 
diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig
index 042c7ad..69714fb 100644
--- a/arch/blackfin/configs/BF561-EZKIT_defconfig
+++ b/arch/blackfin/configs/BF561-EZKIT_defconfig
@@ -329,8 +329,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_BFIN_L2_CACHEABLE is not set
 # CONFIG_MPU is not set
 
diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig
index 3a20e28..017c6ea 100644
--- a/arch/blackfin/configs/BlackStamp_defconfig
+++ b/arch/blackfin/configs/BlackStamp_defconfig
@@ -288,8 +288,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_MPU is not set
 
 #
diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig
index 865ed85..d880ef7 100644
--- a/arch/blackfin/configs/CM-BF527_defconfig
+++ b/arch/blackfin/configs/CM-BF527_defconfig
@@ -332,8 +332,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 # CONFIG_MPU is not set
 
 #
diff --git a/arch/blackfin/configs/CM-BF548_defconfig b/arch/blackfin/configs/CM-BF548_defconfig
index efe9741..542ebc5 100644
--- a/arch/blackfin/configs/CM-BF548_defconfig
+++ b/arch/blackfin/configs/CM-BF548_defconfig
@@ -336,8 +336,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 CONFIG_L1_MAX_PIECE=16
 # CONFIG_MPU is not set
 
diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig
index fa580af..a46529c 100644
--- a/arch/blackfin/configs/SRV1_defconfig
+++ b/arch/blackfin/configs/SRV1_defconfig
@@ -282,8 +282,8 @@ CONFIG_BFIN_ICACHE=y
 CONFIG_BFIN_DCACHE=y
 # CONFIG_BFIN_DCACHE_BANKA is not set
 # CONFIG_BFIN_ICACHE_LOCK is not set
-# CONFIG_BFIN_WB is not set
-CONFIG_BFIN_WT=y
+CONFIG_BFIN_WB=y
+# CONFIG_BFIN_WT is not set
 CONFIG_L1_MAX_PIECE=16
 
 #
-- 
cgit v0.10.2


From 368a12117dd8abf6eaefa37c21ac313b517128b9 Mon Sep 17 00:00:00 2001
From: Tony Breeds <tony@bakeyournoodle.com>
Date: Tue, 3 Mar 2009 17:59:30 +0000
Subject: powerpc: Run sbc610 USB fixup code only on the appropriate platform.

commit a969e76a7101bf5f3d369563df1ca1253dd6131b (powerpc: Correct USB
support for GE Fanuc SBC610) introduced a fixup for NEC usb controllers.
This fixup should only run on GEF SBC610 boards.

Fixes Fedora bug #486511.
(https://bugzilla.redhat.com/show_bug.cgi?id=486511)

Signed-off-by: Tony Breeds <tony@bakeyournoodle.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/86xx/gef_sbc610.c b/arch/powerpc/platforms/86xx/gef_sbc610.c
index fb371f5..d6b772b 100644
--- a/arch/powerpc/platforms/86xx/gef_sbc610.c
+++ b/arch/powerpc/platforms/86xx/gef_sbc610.c
@@ -142,6 +142,10 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev)
 {
 	unsigned int val;
 
+	/* Do not do the fixup on other platforms! */
+	if (!machine_is(gef_sbc610))
+		return;
+
 	printk(KERN_INFO "Running NEC uPD720101 Fixup\n");
 
 	/* Ensure ports 1, 2, 3, 4 & 5 are enabled */
-- 
cgit v0.10.2


From 4843b93c96ae5043c6279c4ec6fcd8ee3866ff5b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 3 Mar 2009 23:37:30 -0800
Subject: netlink: invert error code in netlink_set_err()

The callers of netlink_set_err() currently pass a negative value
as parameter for the error code. However, sk->sk_err wants a
positive error value. Without this patch, skb_recv_datagram() called
by netlink_recvmsg() may return a positive value to report an error.

Another choice to fix this is to change callers to pass a positive
error value, but this seems a bit inconsistent and error prone
to me. Indeed, the callers of netlink_set_err() assumed that the
(usual) negative value for error codes was fine before this patch :).

This patch also includes some documentation in docbook format
for netlink_set_err() to avoid this sort of confusion.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 9eb895c..3ae3cb8 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1084,6 +1084,13 @@ out:
 	return 0;
 }
 
+/**
+ * netlink_set_err - report error to broadcast listeners
+ * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
+ * @pid: the PID of a process that we want to skip (if any)
+ * @groups: the broadcast group that will notice the error
+ * @code: error code, must be negative (as usual in kernelspace)
+ */
 void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
 {
 	struct netlink_set_err_data info;
@@ -1093,7 +1100,8 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
 	info.exclude_sk = ssk;
 	info.pid = pid;
 	info.group = group;
-	info.code = code;
+	/* sk->sk_err wants a positive error value */
+	info.code = -code;
 
 	read_lock(&nl_table_lock);
 
-- 
cgit v0.10.2


From a1a69c8db7f988f903349442a7538d21b56c38e9 Mon Sep 17 00:00:00 2001
From: Peter Korsgaard <jacmet@sunsite.dk>
Date: Tue, 3 Mar 2009 23:48:16 -0800
Subject: dm9601: new vendor/product IDs

Add vendor/product IDs for new no name dm9601 compatible usb ethernet
adaptors.

Reported-by: Eric Lauriault <eric@linux.ca>
Signed-off-by: Peter Korsgaard <jacmet@sunsite.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c
index 5b67bbf..81682c6 100644
--- a/drivers/net/usb/dm9601.c
+++ b/drivers/net/usb/dm9601.c
@@ -635,6 +635,10 @@ static const struct usb_device_id products[] = {
 	USB_DEVICE(0x0a47, 0x9601),	/* Hirose USB-100 */
 	.driver_info = (unsigned long)&dm9601_info,
 	 },
+	{
+	USB_DEVICE(0x0fe6, 0x8101),	/* DM9601 USB to Fast Ethernet Adapter */
+	.driver_info = (unsigned long)&dm9601_info,
+	 },
 	{},			// END
 };
 
-- 
cgit v0.10.2


From 4222474519ff5b31a526dfa1da7aa4b0e38bef5c Mon Sep 17 00:00:00 2001
From: Meelis Roos <mroos@linux.ee>
Date: Tue, 3 Mar 2009 23:48:50 -0800
Subject: net: fix tokenring license

Currently, modular tokenring ("tr") lacks a license and fails to load:

tr: module license 'unspecified' taints kernel.
tr: Unknown symbol proc_net_fops_create

Beacuse of this, no tokenring driver can load if it depends on modular
tr. Fix this by adding GPL module license as it is in the kernel.

With this fix, tr module loads fine and tms380 driver also loads. Well,
it does'nt work but that's a different bug.

Signed-off-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/802/tr.c b/net/802/tr.c
index 158150f..f47ae28 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -668,3 +668,5 @@ module_init(rif_init);
 
 EXPORT_SYMBOL(tr_type_trans);
 EXPORT_SYMBOL(alloc_trdev);
+
+MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From 5fd3a17ed456637a224cf4ca82b9ad9d005bc8d4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 4 Mar 2009 00:57:25 -0700
Subject: md: fix deadlock when stopping arrays

Resolve a deadlock when stopping redundant arrays, i.e. ones that
require a call to sysfs_remove_group when shutdown.  The deadlock is
summarized below:

Thread1                Thread2
-------                -------
read sysfs attribute   stop array
                       take mddev lock
                       sysfs_remove_group
sysfs_get_active
wait for mddev lock
                       wait for active

Sysrq-w:
--------
mdmon         S 00000017  2212  4163      1
  f1982ea8 00000046 2dcf6b85 00000017 c0b23100 f2f83ed0 c0b23100 f2f8413c
  c0b23100 c0b23100 c0b1fb98 f2f8413c 00000000 f2f8413c c0b23100 f2291ecc
  00000002 c0b23100 00000000 00000017 f2f83ed0 f1982eac 00000046 c044d9dd
Call Trace:
  [<c044d9dd>] ? debug_mutex_add_waiter+0x1d/0x58
  [<c06ef451>] __mutex_lock_common+0x1d9/0x338
  [<c06ef451>] ? __mutex_lock_common+0x1d9/0x338
  [<c06ef5e3>] mutex_lock_interruptible_nested+0x33/0x3a
  [<c0634553>] ? mddev_lock+0x14/0x16
  [<c0634553>] mddev_lock+0x14/0x16
  [<c0634eda>] md_attr_show+0x2a/0x49
  [<c04e9997>] sysfs_read_file+0x93/0xf9
mdadm         D 00000017  2812  4177      1
  f0401d78 00000046 430456f8 00000017 f0401d58 f0401d20 c0b23100 f2da2c4c
  c0b23100 c0b23100 c0b1fb98 f2da2c4c 0a10fc36 00000000 c0b23100 f0401d70
  00000003 c0b23100 00000000 00000017 f2da29e0 00000001 00000002 00000000
Call Trace:
  [<c06eed1b>] schedule_timeout+0x1b/0x95
  [<c06eed1b>] ? schedule_timeout+0x1b/0x95
  [<c06eeb97>] ? wait_for_common+0x34/0xdc
  [<c044fa8a>] ? trace_hardirqs_on_caller+0x18/0x145
  [<c044fbc2>] ? trace_hardirqs_on+0xb/0xd
  [<c06eec03>] wait_for_common+0xa0/0xdc
  [<c0428c7c>] ? default_wake_function+0x0/0x12
  [<c06eeccc>] wait_for_completion+0x17/0x19
  [<c04ea620>] sysfs_addrm_finish+0x19f/0x1d1
  [<c04e920e>] sysfs_hash_and_remove+0x42/0x55
  [<c04eb4db>] sysfs_remove_group+0x57/0x86
  [<c0638086>] do_md_stop+0x13a/0x499

This has been there for a while, but is easier to trigger now that mdmon
is closely watching sysfs.

Cc: <stable@kernel.org>
Reported-by: Jacek Danecki <jacek.danecki@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 03b4cd0..a307f87 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -214,12 +214,7 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
 	return mddev;
 }
 
-static void mddev_delayed_delete(struct work_struct *ws)
-{
-	mddev_t *mddev = container_of(ws, mddev_t, del_work);
-	kobject_del(&mddev->kobj);
-	kobject_put(&mddev->kobj);
-}
+static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(mddev_t *mddev)
 {
@@ -3542,6 +3537,21 @@ static struct kobj_type md_ktype = {
 
 int mdp_major = 0;
 
+static void mddev_delayed_delete(struct work_struct *ws)
+{
+	mddev_t *mddev = container_of(ws, mddev_t, del_work);
+
+	if (mddev->private == &md_redundancy_group) {
+		sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+		if (mddev->sysfs_action)
+			sysfs_put(mddev->sysfs_action);
+		mddev->sysfs_action = NULL;
+		mddev->private = NULL;
+	}
+	kobject_del(&mddev->kobj);
+	kobject_put(&mddev->kobj);
+}
+
 static int md_alloc(dev_t dev, char *name)
 {
 	static DEFINE_MUTEX(disks_mutex);
@@ -4033,13 +4043,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
 			mddev->queue->backing_dev_info.congested_fn = NULL;
-			if (mddev->pers->sync_request) {
-				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-				if (mddev->sysfs_action)
-					sysfs_put(mddev->sysfs_action);
-				mddev->sysfs_action = NULL;
-			}
 			module_put(mddev->pers->owner);
+			if (mddev->pers->sync_request)
+				mddev->private = &md_redundancy_group;
 			mddev->pers = NULL;
 			/* tell userspace to handle 'inactive' */
 			sysfs_notify_dirent(mddev->sysfs_state);
-- 
cgit v0.10.2


From 858b9ced6e73a0f087294c398a1ae70a7eeed94f Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 4 Mar 2009 00:11:42 -0800
Subject: net: more timeouts that reach -1

with while (timeout-- > 0); timeout reaches -1 after the loop, so the tests
below are off by one. also don't do an '< 0' test on an unsigned.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/arm/ks8695net.c b/drivers/net/arm/ks8695net.c
index 1cf2f94..f3a1274 100644
--- a/drivers/net/arm/ks8695net.c
+++ b/drivers/net/arm/ks8695net.c
@@ -560,7 +560,7 @@ ks8695_reset(struct ks8695_priv *ksp)
 		msleep(1);
 	}
 
-	if (reset_timeout == 0) {
+	if (reset_timeout < 0) {
 		dev_crit(ksp->dev,
 			 "Timeout waiting for DMA engines to reset\n");
 		/* And blithely carry on */
diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index 08b3405..a6e1a35 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -957,13 +957,14 @@ jme_process_receive(struct jme_adapter *jme, int limit)
 		goto out_inc;
 
 	i = atomic_read(&rxring->next_to_clean);
-	while (limit-- > 0) {
+	while (limit > 0) {
 		rxdesc = rxring->desc;
 		rxdesc += i;
 
 		if ((rxdesc->descwb.flags & cpu_to_le16(RXWBFLAG_OWN)) ||
 		!(rxdesc->descwb.desccnt & RXWBDCNT_WBCPL))
 			goto out;
+		--limit;
 
 		desccnt = rxdesc->descwb.desccnt & RXWBDCNT_DCNT;
 
diff --git a/drivers/net/ucc_geth_mii.c b/drivers/net/ucc_geth_mii.c
index 5463591..0ada4ed 100644
--- a/drivers/net/ucc_geth_mii.c
+++ b/drivers/net/ucc_geth_mii.c
@@ -107,7 +107,7 @@ int uec_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
 static int uec_mdio_reset(struct mii_bus *bus)
 {
 	struct ucc_mii_mng __iomem *regs = (void __iomem *)bus->priv;
-	unsigned int timeout = PHY_INIT_TIMEOUT;
+	int timeout = PHY_INIT_TIMEOUT;
 
 	mutex_lock(&bus->mdio_lock);
 
@@ -123,7 +123,7 @@ static int uec_mdio_reset(struct mii_bus *bus)
 
 	mutex_unlock(&bus->mdio_lock);
 
-	if (timeout <= 0) {
+	if (timeout < 0) {
 		printk(KERN_ERR "%s: The MII Bus is stuck!\n", bus->name);
 		return -EBUSY;
 	}
-- 
cgit v0.10.2


From b9bdcd9bd78d253dcc8e13c29f0acd67e080e7c1 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 4 Mar 2009 00:05:56 -0800
Subject: net pcmcia: worklimit reaches -1

with while (--worklimit >= 0); worklimit reaches -1 after the loop. In
3c589_cs.c this caused a warning not to be printed.

In 3c574_cs.c contrastingly, el3_rx() treats worklimit differently:

static int el3_rx(struct net_device *dev, int worklimit)
{
	while (--worklimit >= 0) { ... }
	return worklimit;
}

el3_rx() is only called by function el3_interrupt(): twice:

static irqreturn_t el3_interrupt(int irq, void *dev_id)
{
        int work_budget = max_interrupt_work;
	while(...) {
		if (...)
			work_budget = el3_rx(dev, work_budget);
		if (...)
			work_budget = el3_rx(dev, work_budget);
		if (--work_budget < 0) {
		        ...
		        break;
		}
	}
}
The error path can occur 2 too early.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index e5cb6b1..2404a83 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -1035,7 +1035,8 @@ static int el3_rx(struct net_device *dev, int worklimit)
 	DEBUG(3, "%s: in rx_packet(), status %4.4x, rx_status %4.4x.\n",
 		  dev->name, inw(ioaddr+EL3_STATUS), inw(ioaddr+RxStatus));
 	while (!((rx_status = inw(ioaddr + RxStatus)) & 0x8000) &&
-		   (--worklimit >= 0)) {
+			worklimit > 0) {
+		worklimit--;
 		if (rx_status & 0x4000) { /* Error, update stats. */
 			short error = rx_status & 0x3800;
 			dev->stats.rx_errors++;
diff --git a/drivers/net/pcmcia/3c589_cs.c b/drivers/net/pcmcia/3c589_cs.c
index 73ecc65..1e01b8a6 100644
--- a/drivers/net/pcmcia/3c589_cs.c
+++ b/drivers/net/pcmcia/3c589_cs.c
@@ -857,7 +857,8 @@ static int el3_rx(struct net_device *dev)
     DEBUG(3, "%s: in rx_packet(), status %4.4x, rx_status %4.4x.\n",
 	  dev->name, inw(ioaddr+EL3_STATUS), inw(ioaddr+RX_STATUS));
     while (!((rx_status = inw(ioaddr + RX_STATUS)) & 0x8000) &&
-	   (--worklimit >= 0)) {
+		    worklimit > 0) {
+	worklimit--;
 	if (rx_status & 0x4000) { /* Error, update stats. */
 	    short error = rx_status & 0x3800;
 	    dev->stats.rx_errors++;
-- 
cgit v0.10.2


From 948731115774c2e5ff7409360f35389459502211 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 4 Mar 2009 00:07:57 -0800
Subject: aoe: error printed 1 too early

with while (i-- > 0); i reaches -1 after the loop, so the test below is printed
one too early: 0 still means success.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index cc25057..eeea477 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -173,7 +173,7 @@ skbfree(struct sk_buff *skb)
 		return;
 	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
 		msleep(Sms);
-	if (i <= 0) {
+	if (i < 0) {
 		printk(KERN_ERR
 			"aoe: %s holds ref: %s\n",
 			skb->dev ? skb->dev->name : "netif",
-- 
cgit v0.10.2


From 4a8fd2cfdad4d043a1fadba2f3f340945d966825 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 4 Mar 2009 00:08:39 -0800
Subject: sungem: another error printed one too early

Another error was printed one too early.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c
index 8d64b1d..0fcb750 100644
--- a/drivers/net/sungem.c
+++ b/drivers/net/sungem.c
@@ -1229,7 +1229,7 @@ static void gem_reset(struct gem *gp)
 			break;
 	} while (val & (GREG_SWRST_TXRST | GREG_SWRST_RXRST));
 
-	if (limit <= 0)
+	if (limit < 0)
 		printk(KERN_ERR "%s: SW reset is ghetto.\n", gp->dev->name);
 
 	if (gp->phy_type == phy_serialink || gp->phy_type == phy_serdes)
-- 
cgit v0.10.2


From 0bf3d933085509916335480121761a1b225e6c23 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 5 Mar 2009 16:44:53 +0800
Subject: Blackfin arch: fix bug - kgdb fails to continue after setting
 breakpoint on bf561-ezkit kernel with smp patch

Free spinlock before call IPI handlers.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>


Header from folded patch 'blackfin_arch__fix_bug_-_kgdb_fails_to_continue_after_setting_breakpoint_on_bf561-ezkit_kernel_with_smp_patch-1':

Blackfin arch: fix bug - kgdb fails to continue after setting breakpoint on bf561-ezkit kernel with smp patch

Don't test l1 code in SMP kernel.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/kernel/kgdb_test.c b/arch/blackfin/kernel/kgdb_test.c
index 3dba9c1..dbcf3e4 100644
--- a/arch/blackfin/kernel/kgdb_test.c
+++ b/arch/blackfin/kernel/kgdb_test.c
@@ -20,6 +20,7 @@
 static char cmdline[256];
 static unsigned long len;
 
+#ifndef CONFIG_SMP
 static int num1 __attribute__((l1_data));
 
 void kgdb_l1_test(void) __attribute__((l1_text));
@@ -32,6 +33,8 @@ void kgdb_l1_test(void)
 	printk(KERN_ALERT "L1(after change) : data variable addr = 0x%p, data value is %d\n", &num1, num1);
 	return ;
 }
+#endif
+
 #if L2_LENGTH
 
 static int num2 __attribute__((l2));
@@ -59,10 +62,12 @@ int kgdb_test(char *name, int len, int count, int z)
 static int test_proc_output(char *buf)
 {
 	kgdb_test("hello world!", 12, 0x55, 0x10);
+#ifndef CONFIG_SMP
 	kgdb_l1_test();
-	#if L2_LENGTH
+#endif
+#if L2_LENGTH
 	kgdb_l2_test();
-	#endif
+#endif
 
 	return 0;
 }
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 77c9928..93eab61 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -158,10 +158,14 @@ static irqreturn_t ipi_handler(int irq, void *dev_instance)
 			kfree(msg);
 			break;
 		case BFIN_IPI_CALL_FUNC:
+			spin_unlock(&msg_queue->lock);
 			ipi_call_function(cpu, msg);
+			spin_lock(&msg_queue->lock);
 			break;
 		case BFIN_IPI_CPU_STOP:
+			spin_unlock(&msg_queue->lock);
 			ipi_cpu_stop(cpu);
+			spin_lock(&msg_queue->lock);
 			kfree(msg);
 			break;
 		default:
@@ -457,7 +461,7 @@ void smp_icache_flush_range_others(unsigned long start, unsigned long end)
 	smp_flush_data.start = start;
 	smp_flush_data.end = end;
 
-	if (smp_call_function(&ipi_flush_icache, &smp_flush_data, 1))
+	if (smp_call_function(&ipi_flush_icache, &smp_flush_data, 0))
 		printk(KERN_WARNING "SMP: failed to run I-cache flush request on other CPUs\n");
 }
 EXPORT_SYMBOL_GPL(smp_icache_flush_range_others);
-- 
cgit v0.10.2


From 199862892e83d04a294eebad45adc40c658b8630 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Thu, 5 Mar 2009 16:45:55 +0800
Subject: Blackfin arch: PM_BFIN_WAKE_GP: update help

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 8f1f97d..be60971 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -1168,6 +1168,12 @@ config PM_BFIN_WAKE_GP
 	default n
 	help
 	  Enable General-Purpose Wake-Up (Voltage Regulator Power-Up)
+	  (all processors, except ADSP-BF549). This option sets
+	  the general-purpose wake-up enable (GPWE) control bit to enable
+	  wake-up upon detection of an active low signal on the /GPW (PH7) pin.
+	  On ADSP-BF549 this option enables the the same functionality on the
+	  /MRXON pin also PH7.
+
 endmenu
 
 menu "CPU Frequency scaling"
-- 
cgit v0.10.2


From ff19fed4fe54f2f1dd439ac02969333ea9a9b4ff Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Wed, 4 Mar 2009 17:35:51 +0800
Subject: Blackfin arch: Fix BUG - kernel fails to build in pm.c when allow
 wakeup fromi standby by GPIO

This feature is not available on BF54x.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index be60971..0c1f86e 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -1129,6 +1129,7 @@ endchoice
 
 config PM_WAKEUP_BY_GPIO
 	bool "Allow Wakeup from Standby by GPIO"
+	depends on PM && !BF54x
 
 config PM_WAKEUP_GPIO_NUMBER
 	int "GPIO number"
-- 
cgit v0.10.2


From c18e99cfba746ab0ad8d45e1f351ed990947c58c Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Wed, 4 Mar 2009 17:36:49 +0800
Subject: Blackfin arch: update anomaly sheets to match latest public info

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf518/include/mach/anomaly.h b/arch/blackfin/mach-bf518/include/mach/anomaly.h
index 6e16700..143a29d 100644
--- a/arch/blackfin/mach-bf518/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf518/include/mach/anomaly.h
@@ -2,12 +2,12 @@
  * File: include/asm-blackfin/mach-bf518/anomaly.h
  * Bugs: Enter bugs at http://blackfin.uclinux.org/
  *
- * Copyright (C) 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
 
 /* This file shoule be up to date with:
- *  - ????
+ *  - Revision B, 02/03/2009; ADSP-BF512/BF514/BF516/BF518 Blackfin Processor Anomaly List
  */
 
 #ifndef _MACH_ANOMALY_H_
@@ -19,6 +19,8 @@
 #define ANOMALY_05000122 (1)
 /* False Hardware Error from an Access in the Shadow of a Conditional Branch */
 #define ANOMALY_05000245 (1)
+/* Incorrect Timer Pulse Width in Single-Shot PWM_OUT Mode with External Clock */
+#define ANOMALY_05000254 (1)
 /* Sensitivity To Noise with Slow Input Edge Rates on External SPORT TX and RX Clocks */
 #define ANOMALY_05000265 (1)
 /* False Hardware Errors Caused by Fetches at the Boundary of Reserved Memory */
@@ -53,6 +55,12 @@
 #define ANOMALY_05000443 (1)
 /* Incorrect L1 Instruction Bank B Memory Map Location */
 #define ANOMALY_05000444 (1)
+/* Incorrect Default Hysteresis Setting for RESET, NMI, and BMODE Signals */
+#define ANOMALY_05000452 (1)
+/* PWM_TRIPB Signal Not Available on PG10 */
+#define ANOMALY_05000453 (1)
+/* PPI_FS3 is Driven One Half Cycle Later Than PPI Data */
+#define ANOMALY_05000455 (1)
 
 /* Anomalies that don't exist on this proc */
 #define ANOMALY_05000125 (0)
@@ -67,6 +75,7 @@
 #define ANOMALY_05000273 (0)
 #define ANOMALY_05000278 (0)
 #define ANOMALY_05000285 (0)
+#define ANOMALY_05000305 (0)
 #define ANOMALY_05000307 (0)
 #define ANOMALY_05000311 (0)
 #define ANOMALY_05000312 (0)
diff --git a/arch/blackfin/mach-bf527/include/mach/anomaly.h b/arch/blackfin/mach-bf527/include/mach/anomaly.h
index fdc8783..4ffccb6 100644
--- a/arch/blackfin/mach-bf527/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf527/include/mach/anomaly.h
@@ -2,7 +2,7 @@
  * File: include/asm-blackfin/mach-bf527/anomaly.h
  * Bugs: Enter bugs at http://blackfin.uclinux.org/
  *
- * Copyright (C) 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
 
@@ -169,6 +169,7 @@
 #define ANOMALY_05000273 (0)
 #define ANOMALY_05000278 (0)
 #define ANOMALY_05000285 (0)
+#define ANOMALY_05000305 (0)
 #define ANOMALY_05000307 (0)
 #define ANOMALY_05000311 (0)
 #define ANOMALY_05000312 (0)
diff --git a/arch/blackfin/mach-bf533/include/mach/anomaly.h b/arch/blackfin/mach-bf533/include/mach/anomaly.h
index 657dd18..9e838f8 100644
--- a/arch/blackfin/mach-bf533/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf533/include/mach/anomaly.h
@@ -2,7 +2,7 @@
  * File: include/asm-blackfin/mach-bf533/anomaly.h
  * Bugs: Enter bugs at http://blackfin.uclinux.org/
  *
- * Copyright (C) 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
 
@@ -160,7 +160,7 @@
 #define ANOMALY_05000301 (__SILICON_REVISION__ < 6)
 /* SSYNCs After Writes To DMA MMR Registers May Not Be Handled Correctly */
 #define ANOMALY_05000302 (__SILICON_REVISION__ < 5)
-/* New Feature: Additional Hysteresis on SPORT Input Pins (Not Available On Older Silicon) */
+/* SPORT_HYS Bit in PLL_CTL Register Is Not Functional */
 #define ANOMALY_05000305 (__SILICON_REVISION__ < 5)
 /* New Feature: Additional PPI Frame Sync Sampling Options (Not Available On Older Silicon) */
 #define ANOMALY_05000306 (__SILICON_REVISION__ < 5)
diff --git a/arch/blackfin/mach-bf537/include/mach/anomaly.h b/arch/blackfin/mach-bf537/include/mach/anomaly.h
index fe1ae4c..438b43f 100644
--- a/arch/blackfin/mach-bf537/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf537/include/mach/anomaly.h
@@ -2,7 +2,7 @@
  * File: include/asm-blackfin/mach-bf537/anomaly.h
  * Bugs: Enter bugs at http://blackfin.uclinux.org/
  *
- * Copyright (C) 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
 
@@ -110,7 +110,7 @@
 #define ANOMALY_05000301 (1)
 /* SSYNCs After Writes To CAN/DMA MMR Registers Are Not Always Handled Correctly */
 #define ANOMALY_05000304 (__SILICON_REVISION__ < 3)
-/* New Feature: Additional Hysteresis on SPORT Input Pins (Not Available On Older Silicon) */
+/* SPORT_HYS Bit in PLL_CTL Register Is Not Functional */
 #define ANOMALY_05000305 (__SILICON_REVISION__ < 3)
 /* SCKELOW Bit Does Not Maintain State Through Hibernate */
 #define ANOMALY_05000307 (__SILICON_REVISION__ < 3)
diff --git a/arch/blackfin/mach-bf538/include/mach/anomaly.h b/arch/blackfin/mach-bf538/include/mach/anomaly.h
index 56ea454..f4ece1c 100644
--- a/arch/blackfin/mach-bf538/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf538/include/mach/anomaly.h
@@ -2,7 +2,7 @@
  * File: include/asm-blackfin/mach-bf538/anomaly.h
  * Bugs: Enter bugs at http://blackfin.uclinux.org/
  *
- * Copyright (C) 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
 
@@ -120,6 +120,7 @@
 #define ANOMALY_05000198 (0)
 #define ANOMALY_05000230 (0)
 #define ANOMALY_05000263 (0)
+#define ANOMALY_05000305 (0)
 #define ANOMALY_05000311 (0)
 #define ANOMALY_05000323 (0)
 #define ANOMALY_05000353 (1)
diff --git a/arch/blackfin/mach-bf548/include/mach/anomaly.h b/arch/blackfin/mach-bf548/include/mach/anomaly.h
index d9d10a7..882e40c 100644
--- a/arch/blackfin/mach-bf548/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf548/include/mach/anomaly.h
@@ -2,12 +2,12 @@
  * File: include/asm-blackfin/mach-bf548/anomaly.h
  * Bugs: Enter bugs at http://blackfin.uclinux.org/
  *
- * Copyright (C) 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
 
 /* This file shoule be up to date with:
- *  - Revision G, 08/07/2008; ADSP-BF542/BF544/BF547/BF548/BF549 Blackfin Processor Anomaly List
+ *  - Revision H, 01/16/2009; ADSP-BF542/BF544/BF547/BF548/BF549 Blackfin Processor Anomaly List
  */
 
 #ifndef _MACH_ANOMALY_H_
@@ -91,8 +91,6 @@
 #define ANOMALY_05000371 (__SILICON_REVISION__ < 2)
 /* USB DP/DM Data Pins May Lose State When Entering Hibernate */
 #define ANOMALY_05000372 (__SILICON_REVISION__ < 1)
-/* Mobile DDR Operation Not Functional */
-#define ANOMALY_05000377 (1)
 /* Security/Authentication Speedpath Causes Authentication To Fail To Initiate */
 #define ANOMALY_05000378 (__SILICON_REVISION__ < 2)
 /* 16-Bit NAND FLASH Boot Mode Is Not Functional */
@@ -157,8 +155,22 @@
 #define ANOMALY_05000429 (__SILICON_REVISION__ < 2)
 /* Software System Reset Corrupts PLL_LOCKCNT Register */
 #define ANOMALY_05000430 (__SILICON_REVISION__ >= 2)
+/* Incorrect Use of Stack in Lockbox Firmware During Authentication */
+#define ANOMALY_05000431 (__SILICON_REVISION__ < 3)
+/* OTP Write Accesses Not Supported */
+#define ANOMALY_05000442 (__SILICON_REVISION__ < 1)
 /* IFLUSH Instruction at End of Hardware Loop Causes Infinite Stall */
 #define ANOMALY_05000443 (1)
+/* CDMAPRIO and L2DMAPRIO Bits in the SYSCR Register Are Not Functional */
+#define ANOMALY_05000446 (1)
+/* UART IrDA Receiver Fails on Extended Bit Pulses */
+#define ANOMALY_05000447 (1)
+/* DDR Clock Duty Cycle Spec Violation (tCH, tCL) */
+#define ANOMALY_05000448 (__SILICON_REVISION__ == 1)
+/* Reduced Timing Margins on DDR Output Setup and Hold (tDS and tDH) */
+#define ANOMALY_05000449 (__SILICON_REVISION__ == 1)
+/* USB DMA Mode 1 Short Packet Data Corruption */
+#define ANOMALY_05000450 (1
 
 /* Anomalies that don't exist on this proc */
 #define ANOMALY_05000125 (0)
@@ -172,6 +184,7 @@
 #define ANOMALY_05000266 (0)
 #define ANOMALY_05000273 (0)
 #define ANOMALY_05000278 (0)
+#define ANOMALY_05000305 (0)
 #define ANOMALY_05000307 (0)
 #define ANOMALY_05000311 (0)
 #define ANOMALY_05000323 (0)
diff --git a/arch/blackfin/mach-bf561/include/mach/anomaly.h b/arch/blackfin/mach-bf561/include/mach/anomaly.h
index 24d3a22..d78bc6b 100644
--- a/arch/blackfin/mach-bf561/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf561/include/mach/anomaly.h
@@ -2,7 +2,7 @@
  * File: include/asm-blackfin/mach-bf561/anomaly.h
  * Bugs: Enter bugs at http://blackfin.uclinux.org/
  *
- * Copyright (C) 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
 
@@ -224,7 +224,7 @@
 #define ANOMALY_05000301 (1)
 /* SSYNCs After Writes To DMA MMR Registers May Not Be Handled Correctly */
 #define ANOMALY_05000302 (1)
-/* New Feature: Additional Hysteresis on SPORT Input Pins (Not Available On Older Silicon) */
+/* SPORT_HYS Bit in PLL_CTL Register Is Not Functional */
 #define ANOMALY_05000305 (__SILICON_REVISION__ < 5)
 /* SCKELOW Bit Does Not Maintain State Through Hibernate */
 #define ANOMALY_05000307 (__SILICON_REVISION__ < 5)
-- 
cgit v0.10.2


From 540aca06b737cc38965b52eeceefba3d24376461 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 4 Mar 2009 11:46:40 +0200
Subject: x86: move devmem_is_allowed() to common mm/init.c

Impact: cleanup

The function is identical on 32-bit and 64-bit configurations so move
it to the common mm/init.c file.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1236160001.29024.29.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722..f89df52 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,9 +1,33 @@
+#include <linux/ioport.h>
 #include <linux/swap.h>
+
 #include <asm/cacheflush.h>
 #include <asm/page.h>
+#include <asm/page_types.h>
 #include <asm/sections.h>
 #include <asm/system.h>
 
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+	if (pagenr <= 256)
+		return 1;
+	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+		return 0;
+	if (!page_is_ram(pagenr))
+		return 1;
+	return 0;
+}
+
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
 	unsigned long addr = begin;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0b087dc..917c4e6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -354,27 +354,6 @@ repeat:
 	}
 }
 
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-	if (pagenr <= 256)
-		return 1;
-	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pagenr))
-		return 1;
-	return 0;
-}
-
 pte_t *kmap_pte;
 pgprot_t kmap_prot;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 724e537..074435e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -876,28 +876,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-	if (pagenr <= 256)
-		return 1;
-	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pagenr))
-		return 1;
-	return 0;
-}
-
-
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
 			 kcore_modules, kcore_vsyscall;
 
-- 
cgit v0.10.2


From 73af76dfd1f998dba71d8e8e785cbe77a990bf17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 4 Mar 2009 11:47:17 +0100
Subject: x86, mce: fix build failure in arch/x86/kernel/cpu/mcheck/threshold.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: build fix

The APIC code rewrite in the x86 tree broke the x86/mce branch:

 arch/x86/kernel/cpu/mcheck/threshold.c: In function ‘mce_threshold_interrupt’:
 arch/x86/kernel/cpu/mcheck/threshold.c:24: error: implicit declaration of function ‘ack_APIC_irq’

Also tidy up the file a bit while at it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index e4b8a38..23ee9e7 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -1,9 +1,13 @@
-/* Common corrected MCE threshold handler code */
-#include <linux/kernel.h>
+/*
+ * Common corrected MCE threshold handler code:
+ */
 #include <linux/interrupt.h>
-#include <asm/mce.h>
+#include <linux/kernel.h>
+
 #include <asm/irq_vectors.h>
+#include <asm/apic.h>
 #include <asm/idle.h>
+#include <asm/mce.h>
 
 static void default_threshold_interrupt(void)
 {
-- 
cgit v0.10.2


From fe7ca2e1e847b65c12d245cbf402af89da96888a Mon Sep 17 00:00:00 2001
From: Brian Haley <brian.haley@hp.com>
Date: Wed, 4 Mar 2009 03:18:11 -0800
Subject: IPv6: add "disable" module parameter support to ipv6.ko

Add "disable" module parameter support to ipv6.ko by specifying
"disable=1" on module load.  We just do the minimum of initializing
inetsw6[] so calls from other modules to inet6_register_protosw()
won't OOPs, then bail out.  No IPv6 addresses or sockets can be
created as a result, and a reboot is required to enable IPv6.

Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt
new file mode 100644
index 0000000..268e5c1
--- /dev/null
+++ b/Documentation/networking/ipv6.txt
@@ -0,0 +1,35 @@
+
+Options for the ipv6 module are supplied as parameters at load time.
+
+Module options may be given as command line arguments to the insmod
+or modprobe command, but are usually specified in either the
+/etc/modules.conf or /etc/modprobe.conf configuration file, or in a
+distro-specific configuration file.
+
+The available ipv6 module parameters are listed below.  If a parameter
+is not specified the default value is used.
+
+The parameters are as follows:
+
+disable
+
+	Specifies whether to load the IPv6 module, but disable all
+	its functionality.  This might be used when another module
+	has a dependency on the IPv6 module being loaded, but no
+	IPv6 addresses or operations are desired.
+
+	The possible values and their effects are:
+
+	0
+		IPv6 is enabled.
+
+		This is the default value.
+
+	1
+		IPv6 is disabled.
+
+		No IPv6 addresses will be added to interfaces, and
+		it will not be possible to open an IPv6 socket.
+
+		A reboot is required to enable IPv6.
+
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c802bc1..da944ec 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -72,6 +72,10 @@ MODULE_LICENSE("GPL");
 static struct list_head inetsw6[SOCK_MAX];
 static DEFINE_SPINLOCK(inetsw6_lock);
 
+static int disable_ipv6 = 0;
+module_param_named(disable, disable_ipv6, int, 0);
+MODULE_PARM_DESC(disable, "Disable IPv6 such that it is non-functional");
+
 static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
 {
 	const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -991,10 +995,21 @@ static int __init inet6_init(void)
 {
 	struct sk_buff *dummy_skb;
 	struct list_head *r;
-	int err;
+	int err = 0;
 
 	BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb));
 
+	/* Register the socket-side information for inet6_create.  */
+	for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
+		INIT_LIST_HEAD(r);
+
+	if (disable_ipv6) {
+		printk(KERN_INFO
+		       "IPv6: Loaded, but administratively disabled, "
+		       "reboot required to enable\n");
+		goto out;
+	}
+
 	err = proto_register(&tcpv6_prot, 1);
 	if (err)
 		goto out;
@@ -1012,10 +1027,6 @@ static int __init inet6_init(void)
 		goto out_unregister_udplite_proto;
 
 
-	/* Register the socket-side information for inet6_create.  */
-	for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
-		INIT_LIST_HEAD(r);
-
 	/* We MUST register RAW sockets before we create the ICMP6,
 	 * IGMP6, or NDISC control sockets.
 	 */
-- 
cgit v0.10.2


From fb13d9f9e450bceafd88ac8a98f7a98e8096a5fe Mon Sep 17 00:00:00 2001
From: Brian Haley <brian.haley@hp.com>
Date: Wed, 4 Mar 2009 03:20:26 -0800
Subject: SCTP: change sctp_ctl_sock_init() to try IPv4 if IPv6 fails

Change sctp_ctl_sock_init() to try IPv4 if IPv6 socket registration
fails.  Required if the IPv6 module is loaded with "disable=1", else
SCTP will fail to load.

Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 4e66384..c4986d0 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -717,15 +717,20 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 static int sctp_ctl_sock_init(void)
 {
 	int err;
-	sa_family_t family;
+	sa_family_t family = PF_INET;
 
 	if (sctp_get_pf_specific(PF_INET6))
 		family = PF_INET6;
-	else
-		family = PF_INET;
 
 	err = inet_ctl_sock_create(&sctp_ctl_sock, family,
 				   SOCK_SEQPACKET, IPPROTO_SCTP, &init_net);
+
+	/* If IPv6 socket could not be created, try the IPv4 socket */
+	if (err < 0 && family == PF_INET6)
+		err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET,
+					   SOCK_SEQPACKET, IPPROTO_SCTP,
+					   &init_net);
+
 	if (err < 0) {
 		printk(KERN_ERR
 		       "SCTP: Failed to create the SCTP control socket.\n");
-- 
cgit v0.10.2


From 5ad8b7d12605e88d1e532061699102797fdefe08 Mon Sep 17 00:00:00 2001
From: Helge Bahmann <helge.bahmann@secunet.com>
Date: Wed, 4 Mar 2009 21:49:14 +1000
Subject: drm: fix double lock typo

[airlied: you shall not retype patches from other trees half asleep]

Signed-of-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c
index 096e2a3..7c8b15b 100644
--- a/drivers/gpu/drm/drm_stub.c
+++ b/drivers/gpu/drm/drm_stub.c
@@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data,
 	    file_priv->minor->master != file_priv->master) {
 		mutex_lock(&dev->struct_mutex);
 		file_priv->minor->master = drm_master_get(file_priv->master);
-		mutex_lock(&dev->struct_mutex);
+		mutex_unlock(&dev->struct_mutex);
 	}
 
 	return 0;
-- 
cgit v0.10.2


From ff0c0874905fb312ca1491bbdac2653b0b48c20b Mon Sep 17 00:00:00 2001
From: Brian Maly <bmaly@redhat.com>
Date: Tue, 3 Mar 2009 21:55:31 -0500
Subject: x86: fix DMI on EFI

Impact: reactivate DMI quirks on EFI hardware

DMI tables are loaded by EFI, so the dmi calls must happen after
efi_init() and not before.

Currently Apple hardware uses DMI to determine the framebuffer mappings
for efifb. Without DMI working you also have no video on MacBook Pro.

This patch resolves the DMI issue for EFI hardware (DMI is now properly
detected at boot), and additionally efifb now loads on Apple hardware
(i.e. video works).

Signed-off-by: Brian Maly <bmaly@redhat>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: ying.huang@intel.com
LKML-Reference: <49ADEDA3.1030406@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

 arch/x86/kernel/setup.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c461f6d..6a8811a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -770,6 +770,9 @@ void __init setup_arch(char **cmdline_p)
 
 	finish_e820_parsing();
 
+	if (efi_enabled)
+		efi_init();
+
 	dmi_scan_machine();
 
 	dmi_check_system(bad_bios_dmi_table);
@@ -789,8 +792,6 @@ void __init setup_arch(char **cmdline_p)
 	insert_resource(&iomem_resource, &data_resource);
 	insert_resource(&iomem_resource, &bss_resource);
 
-	if (efi_enabled)
-		efi_init();
 
 #ifdef CONFIG_X86_32
 	if (ppro_with_ram_bug()) {
-- 
cgit v0.10.2


From 6298e719cf388f43b674f43799af467d3e4e5aa7 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 4 Mar 2009 10:16:07 +0200
Subject: x86: set_highmem_pages_init() cleanup, #2

Impact: cleanup

The zones are set up at this stage so there's a highmem zone
available even for the UMA case.

The only difference there is that for machines that have
CONFIG_HIGHMEM enabled but don't have any highmem available,
->zone_start_pfn is zero whereas highstart_pfn is non-zero).

The field is left zeroed because of the !size test in
free_area_init_core() but shouldn't be a problem because
add_highpages_with_active_regions() handles empty ranges just
fine.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Mel Gorman <mel@csn.ul.ie>
LKML-Reference: <1236154567.29024.23.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c..d117453 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap);
 EXPORT_SYMBOL(kmap_atomic);
 EXPORT_SYMBOL(kunmap_atomic);
 
-#ifdef CONFIG_NUMA
 void __init set_highmem_pages_init(void)
 {
 	struct zone *zone;
@@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void)
 	}
 	totalram_pages += totalhigh_pages;
 }
-#else
-void __init set_highmem_pages_init(void)
-{
-	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
-	totalram_pages += totalhigh_pages;
-}
-#endif /* CONFIG_NUMA */
-- 
cgit v0.10.2


From dd39ecf522ba86c70809715af46e6557f6491131 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 4 Mar 2009 10:58:33 +0800
Subject: x86: EFI: Back efi_ioremap with init_memory_mapping instead of
 FIX_MAP

Impact: Fix boot failure on EFI system with large runtime memory range

Brian Maly reported that some EFI system with large runtime memory
range can not boot. Because the FIX_MAP used to map runtime memory
range is smaller than run time memory range.

This patch fixes this issue by re-implement efi_ioremap() with
init_memory_mapping().

Reported-and-tested-by: Brian Maly <bmaly@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Brian Maly <bmaly@redhat.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236135513.6204.306.camel@yhuang-dev.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2..edc90f2 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #else /* !CONFIG_X86_32 */
 
-#define MAX_EFI_IO_PAGES	100
-
 extern u64 efi_call0(void *fp);
 extern u64 efi_call1(void *fp, u64 arg1);
 extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
index 00a30ab9..8be7409 100644
--- a/arch/x86/include/asm/fixmap_64.h
+++ b/arch/x86/include/asm/fixmap_64.h
@@ -16,7 +16,6 @@
 #include <asm/apicdef.h>
 #include <asm/page.h>
 #include <asm/vsyscall.h>
-#include <asm/efi.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -43,9 +42,6 @@ enum fixed_addresses {
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
-	FIX_EFI_IO_MAP_LAST_PAGE,
-	FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
-				  + MAX_EFI_IO_PAGES - 1,
 #ifdef CONFIG_PARAVIRT
 	FIX_PARAVIRT_BOOTMAP,
 #endif
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d24..eb1ef3b 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -467,7 +467,7 @@ void __init efi_enter_virtual_mode(void)
 	efi_memory_desc_t *md;
 	efi_status_t status;
 	unsigned long size;
-	u64 end, systab, addr, npages;
+	u64 end, systab, addr, npages, end_pfn;
 	void *p, *va;
 
 	efi.systab = NULL;
@@ -479,7 +479,10 @@ void __init efi_enter_virtual_mode(void)
 		size = md->num_pages << EFI_PAGE_SHIFT;
 		end = md->phys_addr + size;
 
-		if (PFN_UP(end) <= max_low_pfn_mapped)
+		end_pfn = PFN_UP(end);
+		if (end_pfn <= max_low_pfn_mapped
+		    || (end_pfn > (1UL << (32 - PAGE_SHIFT))
+			&& end_pfn <= max_pfn_mapped))
 			va = __va(md->phys_addr);
 		else
 			va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c528..cb783b9 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -99,24 +99,11 @@ void __init efi_call_phys_epilog(void)
 
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
-	static unsigned pages_mapped __initdata;
-	unsigned i, pages;
-	unsigned long offset;
+	unsigned long last_map_pfn;
 
-	pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-
-	if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+	last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+	if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
 		return NULL;
 
-	for (i = 0; i < pages; i++) {
-		__set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
-			     phys_addr, PAGE_KERNEL);
-		phys_addr += PAGE_SIZE;
-		pages_mapped++;
-	}
-
-	return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
-					     (pages_mapped - pages)) + offset;
+	return (void __iomem *)__va(phys_addr);
 }
-- 
cgit v0.10.2


From acaabe795a62bba089c185917af86b44654313dc Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 4 Mar 2009 12:56:05 -0600
Subject: x86: UV, SGI RTC: add generic system vector

This patch allocates a system interrupt vector for various platform
specific uses.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
LKML-Reference: <20090304185605.GA24419@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538..c2e6bed 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
 		 smp_invalidate_interrupt)
 #endif
 
+BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 176f058..039db6a 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
 	unsigned int apic_timer_irqs;	/* arch dependent */
 	unsigned int irq_spurious_count;
 #endif
+	unsigned int generic_irqs;	/* arch dependent */
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 370e1c8..b762ea4 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
 
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
+extern void generic_interrupt(void);
 extern void error_interrupt(void);
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb21..f38481b 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
 extern void fixup_irqs(void);
 #endif
 
+extern void (*generic_interrupt_extension)(void);
 extern void init_IRQ(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f3..3cbd79b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
 #define LOCAL_PERF_VECTOR		0xee
 
 /*
+ * Generic system vector for platform specific use
+ */
+#define GENERIC_INTERRUPT_VECTOR	0xed
+
+/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 83d1836..7ba4621 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
 #endif
 apicinterrupt LOCAL_TIMER_VECTOR \
 	apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+	generic_interrupt smp_generic_interrupt
 
 #ifdef CONFIG_SMP
 apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca16..b864341 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
 
 atomic_t irq_err_count;
 
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
 #endif
+	if (generic_interrupt_extension) {
+		seq_printf(p, "PLT: ");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+		seq_printf(p, "  Platform interrupts\n");
+	}
 #ifdef CONFIG_SMP
 	seq_printf(p, "RES: ");
 	for_each_online_cpu(j)
@@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 #endif
+	if (generic_interrupt_extension)
+		sum += irq_stats(cpu)->generic_irqs;
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
 	sum += irq_stats(cpu)->irq_call_count;
@@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	return 1;
 }
 
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	exit_idle();
+
+	irq_enter();
+
+	inc_irq_stat(generic_irqs);
+
+	if (generic_interrupt_extension)
+		generic_interrupt_extension();
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a..bc13261 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1..c7a49e0 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-- 
cgit v0.10.2


From 8661984f628c6f7d9cbaac6697f26d6b0be3ad3b Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 4 Mar 2009 12:57:19 -0600
Subject: x86: UV, SGI RTC: loop through installed UV blades

Add macro to loop through each possible blade.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
LKML-Reference: <20090304185719.GB24419@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327e..9f4dfba 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define SCIR_CPU_ACTIVITY	0x02	/* not idle */
 #define SCIR_CPU_HB_INTERVAL	(HZ)	/* once per second */
 
+/* Loop through all installed blades */
+#define for_each_possible_blade(bid)		\
+	for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
+
 /*
  * Macros for converting between kernel virtual addresses, socket local physical
  * addresses, and UV global physical addresses.
-- 
cgit v0.10.2


From 5ab5ab34498f94d60884c4ccea890601e429042e Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 4 Mar 2009 12:59:18 -0600
Subject: x86: UV, SGI RTC: add UV RTC clocksource/clockevents

This patch provides a high resolution clock/timer source using the
SGI UV system-wide synchronized RTC clock/timer hardware.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
LKML-Reference: <20090304185918.GC24419@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216b..339ce35 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o
+	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
 	obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
 	obj-$(CONFIG_AUDIT)		+= audit_64.o
 
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 0000000..6f8e325
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,391 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2009 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+
+#define RTC_NAME		"sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+				struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+	.name		= RTC_NAME,
+	.rating		= 400,
+	.read		= uv_read_rtc,
+	.mask		= (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+	.shift		= 10,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+	.name		= RTC_NAME,
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+	.shift		= 20,
+	.rating		= 400,
+	.irq		= -1,
+	.set_next_event	= uv_rtc_next_event,
+	.set_mode	= uv_rtc_timer_setup,
+	.event_handler	= NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+	spinlock_t	lock;
+	/* next cpu waiting for timer, local node relative: */
+	int		next_cpu;
+	/* number of cpus on this node: */
+	int		ncpus;
+	struct {
+		int	lcpu;		/* systemwide logical cpu number */
+		u64	expires;	/* next timer expiration for this cpu */
+	} cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head		**blade_info __read_mostly;
+
+static int				uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+	unsigned long apicid, val;
+	int pnode;
+
+	apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	pnode = uv_apicid_to_pnode(apicid);
+	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+	      (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+	return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+		UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+	u64 val;
+	int pnode = uv_cpu_to_pnode(cpu);
+
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+		UVH_RTC1_INT_CONFIG_M_MASK);
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+		UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+	val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+		((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+	/* Set configuration */
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+	/* Initialize comparator value */
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+	return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+	int bid;
+
+	for_each_possible_blade(bid) {
+		kfree(blade_info[bid]);
+	}
+	kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+	int cpu;
+
+	blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+	if (!blade_info)
+		return -ENOMEM;
+	memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+	for_each_present_cpu(cpu) {
+		int nid = cpu_to_node(cpu);
+		int bid = uv_cpu_to_blade_id(cpu);
+		int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		struct uv_rtc_timer_head *head = blade_info[bid];
+
+		if (!head) {
+			head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+				(uv_blade_nr_possible_cpus(bid) *
+					2 * sizeof(u64)),
+				GFP_KERNEL, nid);
+			if (!head) {
+				uv_rtc_deallocate_timers();
+				return -ENOMEM;
+			}
+			spin_lock_init(&head->lock);
+			head->ncpus = uv_blade_nr_possible_cpus(bid);
+			head->next_cpu = -1;
+			blade_info[bid] = head;
+		}
+
+		head->cpu[bcpu].lcpu = cpu;
+		head->cpu[bcpu].expires = ULLONG_MAX;
+	}
+
+	return 0;
+}
+
+/* Find and set the next expiring timer.  */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+	u64 lowest = ULLONG_MAX;
+	int c, bcpu = -1;
+
+	head->next_cpu = -1;
+	for (c = 0; c < head->ncpus; c++) {
+		u64 exp = head->cpu[c].expires;
+		if (exp < lowest) {
+			bcpu = c;
+			lowest = exp;
+		}
+	}
+	if (bcpu >= 0) {
+		head->next_cpu = bcpu;
+		c = head->cpu[bcpu].lcpu;
+		if (uv_setup_intr(c, lowest))
+			/* If we didn't set it up in time, trigger */
+			uv_rtc_send_IPI(c);
+	} else {
+		uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+			UVH_RTC1_INT_CONFIG_M_MASK);
+	}
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int next_cpu;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	next_cpu = head->next_cpu;
+	*t = expires;
+	/* Will this one be next to go off? */
+	if (next_cpu < 0 || bcpu == next_cpu ||
+			expires < head->cpu[next_cpu].expires) {
+		head->next_cpu = bcpu;
+		if (uv_setup_intr(cpu, expires)) {
+			*t = ULLONG_MAX;
+			uv_rtc_find_next_timer(head, pnode);
+			spin_unlock_irqrestore(&head->lock, flags);
+			return 1;
+		}
+	}
+
+	spin_unlock_irqrestore(&head->lock, flags);
+	return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int rc = 0;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+		rc = 1;
+
+	*t = ULLONG_MAX;
+
+	/* Was the hardware setup for this timer? */
+	if (head->next_cpu == bcpu)
+		uv_rtc_find_next_timer(head, pnode);
+
+	spin_unlock_irqrestore(&head->lock, flags);
+
+	return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+	return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+			     struct clock_event_device *ced)
+{
+	int ced_cpu = cpumask_first(ced->cpumask);
+
+	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+			       struct clock_event_device *evt)
+{
+	int ced_cpu = cpumask_first(evt->cpumask);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
+		/* Nothing to do here yet */
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		uv_rtc_unset_timer(ced_cpu);
+		break;
+	}
+}
+
+static void uv_rtc_interrupt(void)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+	int cpu = smp_processor_id();
+
+	if (!ced || !ced->event_handler)
+		return;
+
+	if (uv_rtc_unset_timer(cpu) != 1)
+		return;
+
+	ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+	uv_rtc_enable = 1;
+
+	return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+	*ced = clock_event_device_uv;
+	ced->cpumask = cpumask_of(smp_processor_id());
+	clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+	int rc;
+
+	if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+		return -ENODEV;
+
+	generic_interrupt_extension = uv_rtc_interrupt;
+
+	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+				clocksource_uv.shift);
+
+	rc = clocksource_register(&clocksource_uv);
+	if (rc) {
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	/* Setup and register clockevents */
+	rc = uv_rtc_allocate_timers();
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+				NSEC_PER_SEC, clock_event_device_uv.shift);
+
+	clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+						sn_rtc_cycles_per_second;
+
+	clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+				(NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+	rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		uv_rtc_deallocate_timers();
+	}
+
+	return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
-- 
cgit v0.10.2


From ab9e18587f4cdb5f3fb3854c732f27a36f98e8f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= <dg@emlix.com>
Date: Wed, 4 Mar 2009 19:42:27 +0100
Subject: x86, math-emu: fix init_fpu for task != current
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: fix math-emu related crash while using GDB/ptrace

init_fpu() calls finit to initialize a task's xstate, while finit always
works on the current task. If we use PTRACE_GETFPREGS on another
process and both processes did not already use floating point, we get
a null pointer exception in finit.

This patch creates a new function finit_task that takes a task_struct
parameter. finit becomes a wrapper that simply calls finit_task with
current. On the plus side this avoids many calls to get_current which
would each resolve to an inline assembler mov instruction.

An empty finit_task has been added to i387.h to avoid linker errors in
case the compiler still emits the call in init_fpu when
CONFIG_MATH_EMULATION is not defined.

The declaration of finit in i387.h has been removed as the remaining
code using this function gets its prototype from fpu_proto.h.

Signed-off-by: Daniel Glöckner <dg@emlix.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Pallipadi Venkatesh" <venkatesh.pallipadi@intel.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Bill Metzenthen <billm@melbpc.org.au>
LKML-Reference: <E1Lew31-0004il-Fg@mailer.emlix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004..71c9e51 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 
 #else  /* CONFIG_X86_32 */
 
-extern void finit(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_task(struct task_struct *tsk);
+#else
+static inline void finit_task(struct task_struct *tsk)
+{
+}
+#endif
 
 static inline void tolerant_fwait(void)
 {
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0..f2f8540 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
 		memset(tsk->thread.xstate, 0, xstate_size);
-		finit();
+		finit_task(tsk);
 		set_stopped_child_used_math(tsk);
 		return 0;
 	}
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737..aa09870 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit(void)
+void finit_task(struct task_struct *tsk)
 {
-	control_word = 0x037f;
-	partial_status = 0;
-	top = 0;		/* We don't keep top in the status word internally. */
-	fpu_tag_word = 0xffff;
+	struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
+	struct address *oaddr, *iaddr;
+	soft->cwd = 0x037f;
+	soft->swd = 0;
+	soft->ftop = 0;	/* We don't keep top in the status word internally. */
+	soft->twd = 0xffff;
 	/* The behaviour is different from that detailed in
 	   Section 15.1.6 of the Intel manual */
-	operand_address.offset = 0;
-	operand_address.selector = 0;
-	instruction_address.offset = 0;
-	instruction_address.selector = 0;
-	instruction_address.opcode = 0;
-	no_ip_update = 1;
+	oaddr = (struct address *)&soft->foo;
+	oaddr->offset = 0;
+	oaddr->selector = 0;
+	iaddr = (struct address *)&soft->fip;
+	iaddr->offset = 0;
+	iaddr->selector = 0;
+	iaddr->opcode = 0;
+	soft->no_update = 1;
+}
+
+void finit(void)
+{
+	finit_task(current);
 }
 
 /*
-- 
cgit v0.10.2


From a71edd1f46c8a599509bda478fb4eea27fb0da63 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 01:22:35 -0800
Subject: x86: fix bootmem cross node for 32bit numa

Impact: fix panic on system 2g x4 sockets

Found one system with 4 sockets and every sockets has 2g can not boot
with numa32 because boot mem is crossing nodes.

So try to have numa version of setup_bootmem_allocator().

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AE485B.8000902@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 917c4e6..67bdb59 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -776,9 +776,37 @@ static void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static unsigned long __init setup_node_bootmem(int nodeid,
+				 unsigned long start_pfn,
+				 unsigned long end_pfn,
+				 unsigned long bootmap)
+{
+	unsigned long bootmap_size;
+
+	if (start_pfn > max_low_pfn)
+		return bootmap;
+	if (end_pfn > max_low_pfn)
+		end_pfn = max_low_pfn;
+
+	/* don't touch min_low_pfn */
+	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+					 bootmap >> PAGE_SHIFT,
+					 start_pfn, end_pfn);
+	printk(KERN_INFO "  node %d low ram: %08lx - %08lx\n",
+		nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  node %d bootmap %08lx - %08lx\n",
+		 nodeid, bootmap, bootmap + bootmap_size);
+	free_bootmem_with_active_regions(nodeid, end_pfn);
+	early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+	return bootmap + bootmap_size;
+}
+#endif
+
 void __init setup_bootmem_allocator(void)
 {
-	int i;
+	int nodeid;
 	unsigned long bootmap_size, bootmap;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
@@ -791,18 +819,24 @@ void __init setup_bootmem_allocator(void)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 
-	/* don't touch min_low_pfn */
-	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
-					 min_low_pfn, max_low_pfn);
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
 	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
 		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	for_each_online_node(nodeid)
+		bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
+					node_end_pfn[nodeid], bootmap);
+#else
+	/* don't touch min_low_pfn */
+	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+					 min_low_pfn, max_low_pfn);
 	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
 		 bootmap, bootmap + bootmap_size);
-	for_each_online_node(i)
-		free_bootmem_with_active_regions(i, max_low_pfn);
+	free_bootmem_with_active_regions(0, max_low_pfn);
 	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+#endif
 
 	after_init_bootmem = 1;
 }
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95..3daefa0 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
 	for_each_online_node(nid)
 		propagate_e820_map_node(nid);
 
-	for_each_online_node(nid)
+	for_each_online_node(nid) {
 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+	}
 
-	NODE_DATA(0)->bdata = &bootmem_node_data[0];
 	setup_bootmem_allocator();
 }
 
-- 
cgit v0.10.2


From b68adb16f29c8ea02f21f5ebf65bcabffe217e9f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 01:24:04 -0800
Subject: x86: make 32-bit init_memory_mapping range change more like 64-bit

Impact: cleanup

make code more readable and more like 64-bit

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AE48B4.8010907@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 67bdb59..37aeaf3 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -885,29 +885,55 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
 		(table_start << PAGE_SHIFT) + tables);
 }
 
+struct map_range {
+	unsigned long start;
+	unsigned long end;
+	unsigned page_size_mask;
+};
+
+#define NR_RANGE_MR 3
+
+static int save_mr(struct map_range *mr, int nr_range,
+		   unsigned long start_pfn, unsigned long end_pfn,
+		   unsigned long page_size_mask)
+{
+	if (start_pfn < end_pfn) {
+		if (nr_range >= NR_RANGE_MR)
+			panic("run out of range for init_memory_mapping\n");
+		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
+		mr[nr_range].page_size_mask = page_size_mask;
+		nr_range++;
+	}
+
+	return nr_range;
+}
+
 unsigned long __init_refok init_memory_mapping(unsigned long start,
 						unsigned long end)
 {
 	pgd_t *pgd_base = swapper_pg_dir;
+	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
-	unsigned long big_page_start;
+	unsigned long pos;
+
+	struct map_range mr[NR_RANGE_MR];
+	int nr_range, i;
+	int use_pse;
+
+	printk(KERN_INFO "init_memory_mapping: %08lx-%08lx\n", start, end);
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	/*
 	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
-	int use_pse = 0;
+	use_pse = 0;
 #else
-	int use_pse = cpu_has_pse;
+	use_pse = cpu_has_pse;
 #endif
 
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 */
-	if (!after_init_bootmem)
-		find_early_table_space(end, use_pse);
-
 #ifdef CONFIG_X86_PAE
 	set_nx();
 	if (nx_enabled)
@@ -924,45 +950,81 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
 
+	memset(mr, 0, sizeof(mr));
+	nr_range = 0;
+
+	if (use_pse)
+		page_size_mask |= 1 << PG_LEVEL_2M;
+
 	/*
 	 * Don't use a large page for the first 2/4MB of memory
 	 * because there are often fixed size MTRRs in there
 	 * and overlapping MTRRs into large pages can cause
 	 * slowdowns.
 	 */
-	big_page_start = PMD_SIZE;
-
-	if (start < big_page_start) {
-		start_pfn = start >> PAGE_SHIFT;
-		end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
-	} else {
-		/* head is not big page alignment ? */
-		start_pfn = start >> PAGE_SHIFT;
-		end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+	/* head could not be big page alignment ? */
+	start_pfn = start >> PAGE_SHIFT;
+	pos = start_pfn << PAGE_SHIFT;
+	if (pos == 0)
+		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+	else
+		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 				 << (PMD_SHIFT - PAGE_SHIFT);
+	if (end_pfn > (end>>PAGE_SHIFT))
+		end_pfn = end>>PAGE_SHIFT;
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+		pos = end_pfn << PAGE_SHIFT;
 	}
-	if (start_pfn < end_pfn)
-		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
 
 	/* big page range */
-	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 			 << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < (big_page_start >> PAGE_SHIFT))
-		start_pfn =  big_page_start >> PAGE_SHIFT;
 	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn)
-		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
-					     use_pse);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
 
 	/* tail is not big page alignment ? */
-	start_pfn = end_pfn;
-	if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
-		end_pfn = end >> PAGE_SHIFT;
-		if (start_pfn < end_pfn)
-			kernel_physical_mapping_init(pgd_base, start_pfn,
-							 end_pfn, 0);
+	start_pfn = pos>>PAGE_SHIFT;
+	end_pfn = end>>PAGE_SHIFT;
+	if (start_pfn < end_pfn)
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+	/* try to merge same page size and continuous */
+	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+		unsigned long old_start;
+		if (mr[i].end != mr[i+1].start ||
+		    mr[i].page_size_mask != mr[i+1].page_size_mask)
+			continue;
+		/* move it */
+		old_start = mr[i].start;
+		memmove(&mr[i], &mr[i+1],
+			(nr_range - 1 - i) * sizeof(struct map_range));
+		mr[i--].start = old_start;
+		nr_range--;
 	}
 
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG " %08lx - %08lx page %s\n",
+			mr[i].start, mr[i].end,
+			(mr[i].page_size_mask & (1<<PG_LEVEL_2M)) ?
+				  "big page" : "4k");
+
+	/*
+	 * Find space for the kernel direct mapping tables.
+	 */
+	if (!after_init_bootmem)
+		find_early_table_space(end, use_pse);
+
+	for (i = 0; i < nr_range; i++)
+		kernel_physical_mapping_init(pgd_base,
+				mr[i].start >> PAGE_SHIFT,
+				mr[i].end >> PAGE_SHIFT,
+				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
+
 	early_ioremap_page_table_range_init(pgd_base);
 
 	load_cr3(swapper_pg_dir);
-- 
cgit v0.10.2


From 8d4dd919b46ed982da6ef6bf6fcec454cd7a5b1b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 01:25:21 -0800
Subject: x86: ioremap mptable

Impact: fix boot with mptable above max_low_mapped

Try to use early_ioremap() to map MPC to make sure it works even it is
at the end of ram.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AE4901.3090801@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reported-and-tested-by: Kevin O'Connor <kevin@koconnor.net>

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bd..ae9060c 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 static struct mpf_intel *mpf_found;
 
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+	struct mpc_table *mpc;
+	unsigned long size;
+
+	mpc = early_ioremap(physptr, PAGE_SIZE);
+	size = mpc->length;
+	early_iounmap(mpc, PAGE_SIZE);
+	apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
+
+	return size;
+}
+
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
@@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early)
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
+		struct mpc_table *mpc;
+		unsigned long size;
 
+		size = get_mpc_size(mpf->physptr);
+		mpc = early_ioremap(mpf->physptr, size);
 		/*
 		 * Read the physical hardware table.  Anything here will
 		 * override the defaults.
 		 */
-		if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
+		if (!smp_read_mpc(mpc, early)) {
 #ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 0;
 #endif
@@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
 			       "BIOS bug, MP table errors detected!...\n");
 			printk(KERN_ERR "... disabling SMP support. "
 			       "(tell your hw vendor)\n");
+			early_iounmap(mpc, size);
 			return;
 		}
+		early_iounmap(mpc, size);
 
 		if (early)
 			return;
-- 
cgit v0.10.2


From f62432395ec54e93f113091bcb2e2017eeed7683 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 01:25:54 -0800
Subject: x86: reserve exact size of mptable

Impact: save a bit of RAM

Get the exact size for the reserve_bootmem() call.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AE4922.605@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ae9060c..e819240 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -716,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 
 			if (!reserve)
 				return 1;
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
 					BOOTMEM_DEFAULT);
 			if (mpf->physptr) {
-				unsigned long size = PAGE_SIZE;
+				unsigned long size = get_mpc_size(mpf->physptr);
 #ifdef CONFIG_X86_32
 				/*
 				 * We cannot access to MPC table to compute
-- 
cgit v0.10.2


From dd4124a8a06bca89c077a16437edac010f0bb993 Mon Sep 17 00:00:00 2001
From: Leann Ogasawara <leann.ogasawara@canonical.com>
Date: Wed, 4 Mar 2009 11:53:00 -0800
Subject: x86: add Dell XPS710 reboot quirk

Dell XPS710 will hang on reboot.  This is resolved by adding a quirk to
set bios reboot.

Signed-off-by: Leann Ogasawara <leann.ogasawara@canonical.com>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
Cc: "manoj.iyer" <manoj.iyer@canonical.com>
Cc: <stable@kernel.org>
LKML-Reference: <1236196380.3231.89.camel@emiko>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2b46eb4..4526b3a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -217,6 +217,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
 		},
 	},
+	{	/* Handle problems with rebooting on Dell XPS710 */
+		.callback = set_bios_reboot,
+		.ident = "Dell XPS710",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+		},
+	},
 	{ }
 };
 
-- 
cgit v0.10.2


From 731ddea63600c24ff01e6e5144cea88bf7266ac5 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 4 Mar 2009 11:13:40 +0200
Subject: x86: move free_initrd_mem() to common mm/init.c

Impact: cleanup

The function is identical on 32-bit and 64-bit configurations so move it to the
common mm/init.c file.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1236158020.29024.28.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f89df52..cc7fe66 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -71,3 +71,10 @@ void free_initmem(void)
 			(unsigned long)(&__init_begin),
 			(unsigned long)(&__init_end));
 }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	free_init_pages("initrd memory", start, end);
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 37aeaf3..c69c6b1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -1275,13 +1275,6 @@ void mark_rodata_ro(void)
 }
 #endif
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 074435e..d325186 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -963,13 +963,6 @@ void mark_rodata_ro(void)
 
 #endif
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
-- 
cgit v0.10.2


From 64ca5ab913f1594ef316556e65f5eae63ff50cee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 4 Mar 2009 12:11:56 -0800
Subject: rcu: increment quiescent state counter in ksoftirqd()

If a machine is flooded by network frames, a cpu can loop
100% of its time inside ksoftirqd() without calling schedule().
This can delay RCU grace period to insane values.

Adding rcu_qsctr_inc() call in ksoftirqd() solves this problem.

Paul: "This regression was a result of the recent change from
"schedule()" to "cond_resched()", which got rid of that quiescent
state in the common case where a reschedule is not needed".

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de..9041ea7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu)
 			preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
+			rcu_qsctr_inc((long)__bind_cpu);
 		}
 		preempt_enable();
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v0.10.2


From fc5efe3941c47c0278fe1bbcf8cc02a03a74fcda Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 12:21:24 -0800
Subject: x86: fix bootmem cross node for 32bit numa, cleanup

Impact: clean up

Simplify the code, reuse some lines.
Remove min_low_pfn reference, it is always 0

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AEE2C4.2030602@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c69c6b1..c351456 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -776,7 +776,6 @@ static void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
 static unsigned long __init setup_node_bootmem(int nodeid,
 				 unsigned long start_pfn,
 				 unsigned long end_pfn,
@@ -802,7 +801,6 @@ static unsigned long __init setup_node_bootmem(int nodeid,
 
 	return bootmap + bootmap_size;
 }
-#endif
 
 void __init setup_bootmem_allocator(void)
 {
@@ -812,8 +810,7 @@ void __init setup_bootmem_allocator(void)
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
 	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
-	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+	bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
 				 PAGE_SIZE);
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
@@ -821,21 +818,14 @@ void __init setup_bootmem_allocator(void)
 
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
-	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
-		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
 	for_each_online_node(nodeid)
 		bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
 					node_end_pfn[nodeid], bootmap);
 #else
-	/* don't touch min_low_pfn */
-	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
-					 min_low_pfn, max_low_pfn);
-	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
-		 bootmap, bootmap + bootmap_size);
-	free_bootmem_with_active_regions(0, max_low_pfn);
-	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+	bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
 #endif
 
 	after_init_bootmem = 1;
-- 
cgit v0.10.2


From 211a40c0870457b29100cffea0180fa5083caf96 Mon Sep 17 00:00:00 2001
From: etienne <etienne.basset@numericable.fr>
Date: Wed, 4 Mar 2009 07:33:51 +0100
Subject: smack: fixes for unlabeled host support

The following patch (against 2.6.29rc5) fixes a few issues in the
smack/netlabel "unlabeled host support" functionnality that was added in
2.6.29rc.  It should go in before -final.

1) smack_host_label disregard a "0.0.0.0/0 @" rule (or other label),
preventing 'tagged' tasks to access Internet (many systems drop packets with
IP options)

2) netmasks were not handled correctly, they were stored in a way _not
equivalent_ to conversion to be32 (it was equivalent for /0, /8, /16, /24,
/32 masks but not other masks)

3) smack_netlbladdr prefixes (IP/mask) were not consistent (mask&IP was not
done), so there could have been different list entries for the same IP
prefix; if those entries had different labels, well ...

4) they were not sorted

1) 2) 3) are bugs, 4) is a more cosmetic issue.
The patch :

-creates a new helper smk_netlbladdr_insert to insert a smk_netlbladdr,
-sorted by netmask length

-use the new sorted nature of  smack_netlbladdrs list to simplify
 smack_host_label : the first match _will_ be the more specific

-corrects endianness issues in smk_write_netlbladdr &  netlbladdr_seq_show

Signed-off-by: <etienne.basset@numericable.fr>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 0278bc0..e7ded13 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1498,58 +1498,31 @@ static int smack_socket_post_create(struct socket *sock, int family,
  * looks for host based access restrictions
  *
  * This version will only be appropriate for really small
- * sets of single label hosts. Because of the masking
- * it cannot shortcut out on the first match. There are
- * numerious ways to address the problem, but none of them
- * have been applied here.
+ * sets of single label hosts.
  *
  * Returns the label of the far end or NULL if it's not special.
  */
 static char *smack_host_label(struct sockaddr_in *sip)
 {
 	struct smk_netlbladdr *snp;
-	char *bestlabel = NULL;
 	struct in_addr *siap = &sip->sin_addr;
-	struct in_addr *liap;
-	struct in_addr *miap;
-	struct in_addr bestmask;
 
 	if (siap->s_addr == 0)
 		return NULL;
 
-	bestmask.s_addr = 0;
-
 	for (snp = smack_netlbladdrs; snp != NULL; snp = snp->smk_next) {
-		liap = &snp->smk_host.sin_addr;
-		miap = &snp->smk_mask;
-		/*
-		 * If the addresses match after applying the list entry mask
-		 * the entry matches the address. If it doesn't move along to
-		 * the next entry.
-		 */
-		if ((liap->s_addr & miap->s_addr) !=
-		    (siap->s_addr & miap->s_addr))
-			continue;
 		/*
-		 * If the list entry mask identifies a single address
-		 * it can't get any more specific.
+		 * we break after finding the first match because
+		 * the list is sorted from longest to shortest mask
+		 * so we have found the most specific match
 		 */
-		if (miap->s_addr == 0xffffffff)
+		if ((&snp->smk_host.sin_addr)->s_addr  ==
+			(siap->s_addr & (&snp->smk_mask)->s_addr)) {
 			return snp->smk_label;
-		/*
-		 * If the list entry mask is less specific than the best
-		 * already found this entry is uninteresting.
-		 */
-		if ((miap->s_addr | bestmask.s_addr) == bestmask.s_addr)
-			continue;
-		/*
-		 * This is better than any entry found so far.
-		 */
-		bestmask.s_addr = miap->s_addr;
-		bestlabel = snp->smk_label;
+		}
 	}
 
-	return bestlabel;
+	return NULL;
 }
 
 /**
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 8e42800..51f0efc 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -650,10 +650,6 @@ static void *netlbladdr_seq_next(struct seq_file *s, void *v, loff_t *pos)
 
 	return skp;
 }
-/*
-#define BEMASK	0x80000000
-*/
-#define BEMASK	0x00000001
 #define BEBITS	(sizeof(__be32) * 8)
 
 /*
@@ -663,12 +659,10 @@ static int netlbladdr_seq_show(struct seq_file *s, void *v)
 {
 	struct smk_netlbladdr *skp = (struct smk_netlbladdr *) v;
 	unsigned char *hp = (char *) &skp->smk_host.sin_addr.s_addr;
-	__be32 bebits;
-	int maskn = 0;
+	int maskn;
+	u32 temp_mask = be32_to_cpu(skp->smk_mask.s_addr);
 
-	for (bebits = BEMASK; bebits != 0; maskn++, bebits <<= 1)
-		if ((skp->smk_mask.s_addr & bebits) == 0)
-			break;
+	for (maskn = 0; temp_mask; temp_mask <<= 1, maskn++);
 
 	seq_printf(s, "%u.%u.%u.%u/%d %s\n",
 		hp[0], hp[1], hp[2], hp[3], maskn, skp->smk_label);
@@ -702,6 +696,42 @@ static int smk_open_netlbladdr(struct inode *inode, struct file *file)
 }
 
 /**
+ * smk_netlbladdr_insert
+ * @new : netlabel to insert
+ *
+ * This helper insert netlabel in the smack_netlbladdrs list
+ * sorted by netmask length (longest to smallest)
+ */
+static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
+{
+	struct smk_netlbladdr *m;
+
+	if (smack_netlbladdrs == NULL) {
+		smack_netlbladdrs = new;
+		return;
+	}
+
+	/* the comparison '>' is a bit hacky, but works */
+	if (new->smk_mask.s_addr > smack_netlbladdrs->smk_mask.s_addr) {
+		new->smk_next = smack_netlbladdrs;
+		smack_netlbladdrs = new;
+		return;
+	}
+	for (m = smack_netlbladdrs; m != NULL; m = m->smk_next) {
+		if (m->smk_next == NULL) {
+			m->smk_next = new;
+			return;
+		}
+		if (new->smk_mask.s_addr > m->smk_next->smk_mask.s_addr) {
+			new->smk_next = m->smk_next;
+			m->smk_next = new;
+			return;
+		}
+	}
+}
+
+
+/**
  * smk_write_netlbladdr - write() for /smack/netlabel
  * @filp: file pointer, not actually used
  * @buf: where to get the data from
@@ -724,8 +754,9 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
 	struct netlbl_audit audit_info;
 	struct in_addr mask;
 	unsigned int m;
-	__be32 bebits = BEMASK;
+	u32 mask_bits = (1<<31);
 	__be32 nsa;
+	u32 temp_mask;
 
 	/*
 	 * Must have privilege.
@@ -761,10 +792,13 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
 	if (sp == NULL)
 		return -EINVAL;
 
-	for (mask.s_addr = 0; m > 0; m--) {
-		mask.s_addr |= bebits;
-		bebits <<= 1;
+	for (temp_mask = 0; m > 0; m--) {
+		temp_mask |= mask_bits;
+		mask_bits >>= 1;
 	}
+	mask.s_addr = cpu_to_be32(temp_mask);
+
+	newname.sin_addr.s_addr &= mask.s_addr;
 	/*
 	 * Only allow one writer at a time. Writes should be
 	 * quite rare and small in any case.
@@ -772,6 +806,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
 	mutex_lock(&smk_netlbladdr_lock);
 
 	nsa = newname.sin_addr.s_addr;
+	/* try to find if the prefix is already in the list */
 	for (skp = smack_netlbladdrs; skp != NULL; skp = skp->smk_next)
 		if (skp->smk_host.sin_addr.s_addr == nsa &&
 		    skp->smk_mask.s_addr == mask.s_addr)
@@ -787,9 +822,8 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
 			rc = 0;
 			skp->smk_host.sin_addr.s_addr = newname.sin_addr.s_addr;
 			skp->smk_mask.s_addr = mask.s_addr;
-			skp->smk_next = smack_netlbladdrs;
 			skp->smk_label = sp;
-			smack_netlbladdrs = skp;
+			smk_netlbladdr_insert(skp);
 		}
 	} else {
 		rc = netlbl_cfg_unlbl_static_del(&init_net, NULL,
-- 
cgit v0.10.2


From 49bc46360d68156ce82b2b1a12badb80078453a0 Mon Sep 17 00:00:00 2001
From: Maciej Sosnowski <maciej.sosnowski@intel.com>
Date: Thu, 26 Feb 2009 11:04:23 +0100
Subject: I/OAT: add verification for proper APICID_TAG_MAP setting by BIOS

BIOS versions for systems with I/OAT ver.2 have been found
which fail to program APICID_TAG_MAP for DCA.
The ioatdma driver should recognize incorrectly set APICID_TAG_MAP
and disable DCA in that case.

Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioat_dca.c b/drivers/dma/ioat_dca.c
index 6cf622d..78705ca 100644
--- a/drivers/dma/ioat_dca.c
+++ b/drivers/dma/ioat_dca.c
@@ -49,6 +49,23 @@
 
 #define DCA_TAG_MAP_MASK 0xDF
 
+/* expected tag map bytes for I/OAT ver.2 */
+#define DCA2_TAG_MAP_BYTE0 0x80
+#define DCA2_TAG_MAP_BYTE1 0x0
+#define DCA2_TAG_MAP_BYTE2 0x81
+#define DCA2_TAG_MAP_BYTE3 0x82
+#define DCA2_TAG_MAP_BYTE4 0x82
+
+/* verify if tag map matches expected values */
+static inline int dca2_tag_map_valid(u8 *tag_map)
+{
+	return ((tag_map[0] == DCA2_TAG_MAP_BYTE0) &&
+		(tag_map[1] == DCA2_TAG_MAP_BYTE1) &&
+		(tag_map[2] == DCA2_TAG_MAP_BYTE2) &&
+		(tag_map[3] == DCA2_TAG_MAP_BYTE3) &&
+		(tag_map[4] == DCA2_TAG_MAP_BYTE4));
+}
+
 /*
  * "Legacy" DCA systems do not implement the DCA register set in the
  * I/OAT device.  Software needs direct support for their tag mappings.
@@ -452,6 +469,13 @@ struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase)
 			ioatdca->tag_map[i] = 0;
 	}
 
+	if (!dca2_tag_map_valid(ioatdca->tag_map)) {
+		dev_err(&pdev->dev, "APICID_TAG_MAP set incorrectly by BIOS, "
+			"disabling DCA\n");
+		free_dca_provider(dca);
+		return NULL;
+	}
+
 	err = register_dca_provider(dca, &pdev->dev);
 	if (err) {
 		free_dca_provider(dca);
-- 
cgit v0.10.2


From ea9c717d0148d4194f9bd04ecfa6b59b20fc0a08 Mon Sep 17 00:00:00 2001
From: Maciej Sosnowski <maciej.sosnowski@intel.com>
Date: Thu, 26 Feb 2009 11:04:38 +0100
Subject: I/OAT: do not set DCACTRL_CMPL_WRITE_ENABLE for I/OAT ver.3

Flag DCACTRL_CMPL_WRITE_ENABLE is valid only for I/OAT ver.2
so it should not be set for I/OAT ver.3.

Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index b3759c4..879f4a0 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -189,11 +189,13 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device)
 		ioat_chan->xfercap = xfercap;
 		ioat_chan->desccount = 0;
 		INIT_DELAYED_WORK(&ioat_chan->work, ioat_dma_chan_reset_part2);
-		if (ioat_chan->device->version != IOAT_VER_1_2) {
-			writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE
-					| IOAT_DMA_DCA_ANY_CPU,
-				ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
-		}
+		if (ioat_chan->device->version == IOAT_VER_2_0)
+			writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE |
+			       IOAT_DMA_DCA_ANY_CPU,
+			       ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
+		else if (ioat_chan->device->version == IOAT_VER_3_0)
+			writel(IOAT_DMA_DCA_ANY_CPU,
+			       ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
 		spin_lock_init(&ioat_chan->cleanup_lock);
 		spin_lock_init(&ioat_chan->desc_lock);
 		INIT_LIST_HEAD(&ioat_chan->free_desc);
-- 
cgit v0.10.2


From 8b794b141c633083408d0bfb2229b3406d0ebf99 Mon Sep 17 00:00:00 2001
From: Maciej Sosnowski <maciej.sosnowski@intel.com>
Date: Thu, 26 Feb 2009 11:04:54 +0100
Subject: I/OAT: fail initialization on zero channels detection

On some systems with I/OAT ver.2 when DCA is disabled in BIOS
situations have been observed
that zero DMA channels are detected instead of four.
To avoid kernel panic driver should fail gracefully with appropriate message.

Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index 879f4a0..9012da7 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -1659,6 +1659,13 @@ struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev,
 		" %d channels, device version 0x%02x, driver version %s\n",
 		device->common.chancnt, device->version, IOAT_DMA_VERSION);
 
+	if (!device->common.chancnt) {
+		dev_err(&device->pdev->dev,
+			"Intel(R) I/OAT DMA Engine problem found: "
+			"zero channels detected\n");
+		goto err_setup_interrupts;
+	}
+
 	err = ioat_dma_setup_interrupts(device);
 	if (err)
 		goto err_setup_interrupts;
-- 
cgit v0.10.2


From 2b8a6bf896ef47cc7d84c503079cc7b99789f9fa Mon Sep 17 00:00:00 2001
From: Maciej Sosnowski <maciej.sosnowski@intel.com>
Date: Thu, 26 Feb 2009 11:05:07 +0100
Subject: I/OAT: cancel watchdog before dma remove

Channel watchdog should be canceled before the rest of dma remove stuff.

Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index 9012da7..fc9b845 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -1705,6 +1705,9 @@ void ioat_dma_remove(struct ioatdma_device *device)
 	struct dma_chan *chan, *_chan;
 	struct ioat_dma_chan *ioat_chan;
 
+	if (device->version != IOAT_VER_3_0)
+		cancel_delayed_work(&device->work);
+
 	ioat_dma_remove_interrupts(device);
 
 	dma_async_device_unregister(&device->common);
@@ -1716,10 +1719,6 @@ void ioat_dma_remove(struct ioatdma_device *device)
 	pci_release_regions(device->pdev);
 	pci_disable_device(device->pdev);
 
-	if (device->version != IOAT_VER_3_0) {
-		cancel_delayed_work(&device->work);
-	}
-
 	list_for_each_entry_safe(chan, _chan,
 				 &device->common.channels, device_node) {
 		ioat_chan = to_ioat_chan(chan);
-- 
cgit v0.10.2


From 5de22343b2303b278ab562e5d166ffe306566d30 Mon Sep 17 00:00:00 2001
From: Maciej Sosnowski <maciej.sosnowski@intel.com>
Date: Thu, 26 Feb 2009 11:05:17 +0100
Subject: I/OAT: set tcp_dma_copybreak to 256k for I/OAT ver.3

Upcoming server platforms from Intel based on the Nehalem performance
have significantly improved CPU based copy performance.
However, the DMA engine can still be effective at higher I/O sizes
for TCP traffic and at this time copybreak
should be set to 256k for TCP traffic only.

Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index a3306d0..dcf8db5 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -135,12 +135,14 @@ static inline void ioat_set_tcp_copy_break(struct ioatdma_device *dev)
 	#ifdef CONFIG_NET_DMA
 	switch (dev->version) {
 	case IOAT_VER_1_2:
-	case IOAT_VER_3_0:
 		sysctl_tcp_dma_copybreak = 4096;
 		break;
 	case IOAT_VER_2_0:
 		sysctl_tcp_dma_copybreak = 2048;
 		break;
+	case IOAT_VER_3_0:
+		sysctl_tcp_dma_copybreak = 262144;
+		break;
 	}
 	#endif
 }
-- 
cgit v0.10.2


From aa2d0b8b97efa1033609ca89b9faa5d3a1232959 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Thu, 26 Feb 2009 11:05:30 +0100
Subject: I/OAT: list usage cleanup

Trivial cleanup, list_del(); list_add_tail() is equivalent
to list_move_tail(). Semantic patch for coccinelle can be
found at www.cccmz.de/~snakebyte/list_move_tail.spatch

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index fc9b845..ae8c0ce 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -1171,9 +1171,8 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan)
 				 * up if the client is done with the descriptor
 				 */
 				if (async_tx_test_ack(&desc->async_tx)) {
-					list_del(&desc->node);
-					list_add_tail(&desc->node,
-						      &ioat_chan->free_desc);
+					list_move_tail(&desc->node,
+						       &ioat_chan->free_desc);
 				} else
 					desc->async_tx.cookie = 0;
 			} else {
-- 
cgit v0.10.2


From 211a22ce08dbb27eb1a66df8a4bdae5e96092bc8 Mon Sep 17 00:00:00 2001
From: Maciej Sosnowski <maciej.sosnowski@intel.com>
Date: Thu, 26 Feb 2009 11:05:43 +0100
Subject: I/OAT: update driver version and copyright dates

Together with new fixes update driver version
and extend copyright dates ranges.

Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c
index 33bd753..25b743a 100644
--- a/drivers/dca/dca-core.c
+++ b/drivers/dca/dca-core.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2007 Intel Corporation. All rights reserved.
+ * Copyright(c) 2007 - 2009 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c
index 4105d65..ed83dd9 100644
--- a/drivers/dma/ioat.c
+++ b/drivers/dma/ioat.c
@@ -1,6 +1,6 @@
 /*
  * Intel I/OAT DMA Linux driver
- * Copyright(c) 2007 Intel Corporation.
+ * Copyright(c) 2007 - 2009 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
diff --git a/drivers/dma/ioat_dca.c b/drivers/dma/ioat_dca.c
index 78705ca..c012a1e 100644
--- a/drivers/dma/ioat_dca.c
+++ b/drivers/dma/ioat_dca.c
@@ -1,6 +1,6 @@
 /*
  * Intel I/OAT DMA Linux driver
- * Copyright(c) 2007 Intel Corporation.
+ * Copyright(c) 2007 - 2009 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index ae8c0ce..068b635 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -1,6 +1,6 @@
 /*
  * Intel I/OAT DMA Linux driver
- * Copyright(c) 2004 - 2007 Intel Corporation.
+ * Copyright(c) 2004 - 2009 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index dcf8db5..a52ff4b 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2004 - 2007 Intel Corporation. All rights reserved.
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -29,7 +29,7 @@
 #include <linux/pci_ids.h>
 #include <net/tcp.h>
 
-#define IOAT_DMA_VERSION  "3.30"
+#define IOAT_DMA_VERSION  "3.64"
 
 enum ioat_interrupt {
 	none = 0,
diff --git a/drivers/dma/ioatdma_hw.h b/drivers/dma/ioatdma_hw.h
index f1ae2c7..afa57ee 100644
--- a/drivers/dma/ioatdma_hw.h
+++ b/drivers/dma/ioatdma_hw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2004 - 2007 Intel Corporation. All rights reserved.
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
diff --git a/drivers/dma/ioatdma_registers.h b/drivers/dma/ioatdma_registers.h
index 827cb50..49bc277 100644
--- a/drivers/dma/ioatdma_registers.h
+++ b/drivers/dma/ioatdma_registers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2004 - 2007 Intel Corporation. All rights reserved.
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
-- 
cgit v0.10.2


From 0c33e1ca3d80647f2e72e44524fd21e79214da20 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 2 Mar 2009 13:31:35 -0700
Subject: I/OAT: fail self-test if callback test reaches timeout

If we miss interrupts in the self test then fail registration of this
channel as it is unsuitable for use as a public channel.

Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index 068b635..5905cd3 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -1363,6 +1363,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device)
 	dma_cookie_t cookie;
 	int err = 0;
 	struct completion cmp;
+	unsigned long tmo;
 
 	src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
 	if (!src)
@@ -1414,9 +1415,10 @@ static int ioat_dma_self_test(struct ioatdma_device *device)
 	}
 	device->common.device_issue_pending(dma_chan);
 
-	wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-	if (device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL)
+	if (tmo == 0 ||
+	    device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL)
 					!= DMA_SUCCESS) {
 		dev_err(&device->pdev->dev,
 			"Self-test copy timed out, disabling\n");
-- 
cgit v0.10.2


From 900325a6ce33995688b7a680a34e7698f16f4d72 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 2 Mar 2009 15:33:46 -0700
Subject: fsldma: fix off by one in dma_halt

Prevent dev_err from firing even if we successfully detected 'dma-idle'
before the full 1ms timeout has elapsed.

Acked-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 70126a6..86d6da4 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -158,7 +158,8 @@ static void dma_start(struct fsl_dma_chan *fsl_chan)
 
 static void dma_halt(struct fsl_dma_chan *fsl_chan)
 {
-	int i = 0;
+	int i;
+
 	DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr,
 		DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) | FSL_DMA_MR_CA,
 		32);
@@ -166,8 +167,11 @@ static void dma_halt(struct fsl_dma_chan *fsl_chan)
 		DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) & ~(FSL_DMA_MR_CS
 		| FSL_DMA_MR_EMS_EN | FSL_DMA_MR_CA), 32);
 
-	while (!dma_is_idle(fsl_chan) && (i++ < 100))
+	for (i = 0; i < 100; i++) {
+		if (dma_is_idle(fsl_chan))
+			break;
 		udelay(10);
+	}
 	if (i >= 100 && !dma_is_idle(fsl_chan))
 		dev_err(fsl_chan->dev, "DMA halt timeout!\n");
 }
-- 
cgit v0.10.2


From a09b09ae51ace43d28cd9bc1c8bb97986f2b55a6 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 25 Feb 2009 13:56:21 +0100
Subject: iop-adma, mv_xor: fix mem leak on self-test setup failure

iop_adma_zero_sum_self_test has the brackets in the wrong place for the
setup failure deallocation path.  This error was duplicated in
mv_xor_xor_self_test.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index ea5440d..4131ab8 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -928,19 +928,19 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
 
 	for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) {
 		xor_srcs[src_idx] = alloc_page(GFP_KERNEL);
-		if (!xor_srcs[src_idx])
-			while (src_idx--) {
+		if (!xor_srcs[src_idx]) {
+			while (src_idx--)
 				__free_page(xor_srcs[src_idx]);
-				return -ENOMEM;
-			}
+			return -ENOMEM;
+		}
 	}
 
 	dest = alloc_page(GFP_KERNEL);
-	if (!dest)
-		while (src_idx--) {
+	if (!dest) {
+		while (src_idx--)
 			__free_page(xor_srcs[src_idx]);
-			return -ENOMEM;
-		}
+		return -ENOMEM;
+	}
 
 	/* Fill in src buffers */
 	for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) {
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index d35cbd1..2a4e3e3 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -1019,19 +1019,19 @@ mv_xor_xor_self_test(struct mv_xor_device *device)
 
 	for (src_idx = 0; src_idx < MV_XOR_NUM_SRC_TEST; src_idx++) {
 		xor_srcs[src_idx] = alloc_page(GFP_KERNEL);
-		if (!xor_srcs[src_idx])
-			while (src_idx--) {
+		if (!xor_srcs[src_idx]) {
+			while (src_idx--)
 				__free_page(xor_srcs[src_idx]);
-				return -ENOMEM;
-			}
+			return -ENOMEM;
+		}
 	}
 
 	dest = alloc_page(GFP_KERNEL);
-	if (!dest)
-		while (src_idx--) {
+	if (!dest) {
+		while (src_idx--)
 			__free_page(xor_srcs[src_idx]);
-			return -ENOMEM;
-		}
+		return -ENOMEM;
+	}
 
 	/* Fill in src buffers */
 	for (src_idx = 0; src_idx < MV_XOR_NUM_SRC_TEST; src_idx++) {
-- 
cgit v0.10.2


From c74ef1f867d18171c8617519ee5fe40b02903934 Mon Sep 17 00:00:00 2001
From: Luotao Fu <l.fu@pengutronix.de>
Date: Thu, 26 Feb 2009 12:29:20 +0100
Subject: ipu_idmac: fix spinlock type

fix a probably accidently dropped reference operator while calling
spin_unlock_restore to an ipu lock.

Signed-off-by: Luotao Fu <l.fu@pengutronix.de>
Cc: Guennadi Liakhovetski <lg@denx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c
index 1f154d0..ae50a9d 100644
--- a/drivers/dma/ipu/ipu_idmac.c
+++ b/drivers/dma/ipu/ipu_idmac.c
@@ -729,7 +729,7 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan,
 
 	ichan->status = IPU_CHANNEL_READY;
 
-	spin_unlock_irqrestore(ipu->lock, flags);
+	spin_unlock_irqrestore(&ipu->lock, flags);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 7cbd4877e5b167b56a3d6033b926a9f925186e12 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 4 Mar 2009 16:06:03 -0700
Subject: dmatest: fix use after free in dmatest_exit

dmatest_cleanup_chanel will free dtc, so grab ->chan before it goes away
and use it to do the release.

Reported-by: Thierry Reding <thierry.reding@avionic-design.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index 732fa1e..e190d8b 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -430,13 +430,15 @@ late_initcall(dmatest_init);
 static void __exit dmatest_exit(void)
 {
 	struct dmatest_chan *dtc, *_dtc;
+	struct dma_chan *chan;
 
 	list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) {
 		list_del(&dtc->node);
+		chan = dtc->chan;
 		dmatest_cleanup_channel(dtc);
 		pr_debug("dmatest: dropped channel %s\n",
-			 dma_chan_name(dtc->chan));
-		dma_release_channel(dtc->chan);
+			 dma_chan_name(chan));
+		dma_release_channel(chan);
 	}
 }
 module_exit(dmatest_exit);
-- 
cgit v0.10.2


From 9f8ac0b7b063be77f0de7a27fe5e6a0aa2cce58d Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Wed, 25 Feb 2009 14:21:20 +0000
Subject: tg3: Fix 5906 link problems

Commit 6833c043f9fc03696fde623914c4a0277df2a0bc introduced the phy
auto-powerdown capability.  While the APD feature only works for 5761
and 5784 asic revisions, the (harmless portion of the) code was applied
to all 5705 and newer devices.  However, the 5906 phy departs from the
usual design.  This commit was interfering with the 5906's ability to
negotiate link against some switches.  This patch corrects the problem.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Signed-off-by: Benjamin Li <benli@broadcom.com>
Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index b080f94..dabdf59 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1473,7 +1473,8 @@ static void tg3_phy_toggle_apd(struct tg3 *tp, bool enable)
 {
 	u32 reg;
 
-	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS))
+	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS) ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906)
 		return;
 
 	reg = MII_TG3_MISC_SHDW_WREN |
-- 
cgit v0.10.2


From 7ce9d5d1f3c8736511daa413c64985a05b2feee3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 4 Mar 2009 18:38:18 -0500
Subject: ext4: fix ext4_free_inode() vs. ext4_claim_inode() race

I was seeing fsck errors on inode bitmaps after a 4 thread
dbench run on a 4 cpu machine:

Inode bitmap differences: -50736 -(50752--50753) etc...

I believe that this is because ext4_free_inode() uses atomic
bitops, and although ext4_new_inode() *used* to also use atomic
bitops for synchronization, commit
393418676a7602e1d7d3f6e560159c65c8cbd50e changed this to use
the sb_bgl_lock, so that we could also synchronize against
read_inode_bitmap and initialization of uninit inode tables.

However, that change left ext4_free_inode using atomic bitops,
which I think leaves no synchronization between setting &
unsetting bits in the inode table.

The below patch fixes it for me, although I wonder if we're
getting at all heavy-handed with this spinlock...

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18a919..627f8c3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
-	int fatal = 0, err, count;
+	int fatal = 0, err, count, cleared;
 	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
@@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		goto error_return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
-	if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
-					bit, bitmap_bh->b_data))
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	if (!cleared)
 		ext4_error(sb, "ext4_free_inode",
 			   "bit already cleared for inode %lu", ino);
 	else {
-- 
cgit v0.10.2


From 118e1ef6fabfc023126e6075f6ac0fc729cb5285 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Thu, 5 Mar 2009 00:31:12 +0000
Subject: Squashfs: Fix oops when reading fsfuzzer corrupted filesystems

This fixes a code regression caused by the recent mainlining changes.
The recent code changes call zlib_inflate repeatedly, decompressing into
separate 4K buffers, this code didn't check for the possibility that
zlib_inflate might ask for too many buffers when decompressing corrupted
data.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index c837dfc..321728f 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb,
  * generated a larger block - this does occasionally happen with zlib).
  */
 int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
-			int length, u64 *next_index, int srclength)
+			int length, u64 *next_index, int srclength, int pages)
 {
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	struct buffer_head **bh;
@@ -185,6 +185,14 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 			}
 
 			if (msblk->stream.avail_out == 0) {
+				if (page == pages) {
+					ERROR("zlib_inflate tried to "
+						"decompress too much data, "
+						"expected %d bytes.  Zlib "
+						"data probably corrupt\n",
+						srclength);
+					goto release_mutex;
+				}
 				msblk->stream.next_out = buffer[page++];
 				msblk->stream.avail_out = PAGE_CACHE_SIZE;
 			}
@@ -268,7 +276,8 @@ block_release:
 		put_bh(bh[k]);
 
 read_failure:
-	ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+	ERROR("squashfs_read_data failed to read block 0x%llx\n",
+					(unsigned long long) index);
 	kfree(bh);
 	return -EIO;
 }
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f29eda1..1c4739e 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
 
 			entry->length = squashfs_read_data(sb, entry->data,
 				block, length, &entry->next_index,
-				cache->block_size);
+				cache->block_size, cache->pages);
 
 			spin_lock(&cache->lock);
 
@@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
 	for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
 		data[i] = buffer;
 	res = squashfs_read_data(sb, data, block, length |
-		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
+		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
 	kfree(data);
 	return res;
 }
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6b2515d..0e9feb6 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
 
 /* block.c */
 extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
-				int);
+				int, int);
 
 /* cache.c */
 extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 071df5b..681ec0d 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void)
 		return err;
 	}
 
-	printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
+	printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
 		"Phillip Lougher\n");
 
 	return 0;
-- 
cgit v0.10.2


From edf2e2811efa9304ebe14f778d33b764cfd58b7a Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Thu, 5 Mar 2009 00:40:13 +0000
Subject: Squashfs: fix documentation typo, Cramfs filesystem limit is 256 MiB

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>

diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index 3e79e4a..b324c03 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -22,7 +22,7 @@ Squashfs filesystem features versus Cramfs:
 
 				Squashfs		Cramfs
 
-Max filesystem size:		2^64			16 MiB
+Max filesystem size:		2^64			256 MiB
 Max file size:			~ 2 TiB			16 MiB
 Max files:			unlimited		unlimited
 Max directories:		unlimited		unlimited
-- 
cgit v0.10.2


From f4f8056a862a9950320429dfda708c88b4ce6025 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 5 Mar 2009 00:55:31 +0000
Subject: Squashfs: frag_size should be signed, as it can hold an error result

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>

diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 7a63398..9101dbd 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 	type = le16_to_cpu(sqshb_ino->inode_type);
 	switch (type) {
 	case SQUASHFS_REG_TYPE: {
-		unsigned int frag_offset, frag_size, frag;
+		unsigned int frag_offset, frag;
+		int frag_size;
 		u64 frag_blk;
 		struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
 
@@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		break;
 	}
 	case SQUASHFS_LREG_TYPE: {
-		unsigned int frag_offset, frag_size, frag;
+		unsigned int frag_offset, frag;
+		int frag_size;
 		u64 frag_blk;
 		struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
 
-- 
cgit v0.10.2


From a883bf564ea555447a76682bb2d8d4bc92e23e0e Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Wed, 4 Mar 2009 17:38:10 -0800
Subject: pkt_sched: act_police: Fix a rate estimator test.

A commit c1b56878fb68e9c14070939ea4537ad4db79ffae "tc: policing requires
a rate estimator" introduced a test which invalidates previously working
configs, based on examples from iproute2: doc/actions/actions-general.
This is too rigorous: a rate estimator is needed only when police's
"avrate" option is used.

Reported-by: Joao Correia <joaomiguelcorreia@gmail.com>
Diagnosed-by: John Dykstra <john.dykstra1@gmail.com>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 5c72a11..f8f047b6 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -183,13 +183,6 @@ override:
 		if (R_tab == NULL)
 			goto failure;
 
-		if (!est && (ret == ACT_P_CREATED ||
-			     !gen_estimator_active(&police->tcf_bstats,
-						   &police->tcf_rate_est))) {
-			err = -EINVAL;
-			goto failure;
-		}
-
 		if (parm->peakrate.rate) {
 			P_tab = qdisc_get_rtab(&parm->peakrate,
 					       tb[TCA_POLICE_PEAKRATE]);
@@ -205,6 +198,12 @@ override:
 					    &police->tcf_lock, est);
 		if (err)
 			goto failure_unlock;
+	} else if (tb[TCA_POLICE_AVRATE] &&
+		   (ret == ACT_P_CREATED ||
+		    !gen_estimator_active(&police->tcf_bstats,
+					  &police->tcf_rate_est))) {
+		err = -EINVAL;
+		goto failure_unlock;
 	}
 
 	/* No failure allowed after this point */
-- 
cgit v0.10.2


From 87786945fe4b0e60e8f1db62d5ee8a3cec539a67 Mon Sep 17 00:00:00 2001
From: Meelis Roos <mroos@linux.ee>
Date: Wed, 4 Mar 2009 04:59:41 +0000
Subject: tmspci: fix request_irq race

Currently, tmspci tokenring driver crashes on device initialization
because it requests its irq before initializing corresponding data
structures. Fix this by moving request_irq call to a safer place.

Signed-off-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tokenring/tmspci.c b/drivers/net/tokenring/tmspci.c
index 5f60177..e2150b3 100644
--- a/drivers/net/tokenring/tmspci.c
+++ b/drivers/net/tokenring/tmspci.c
@@ -121,11 +121,6 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic
 		goto err_out_trdev;
 	}
 
-	ret = request_irq(pdev->irq, tms380tr_interrupt, IRQF_SHARED,
-			  dev->name, dev);
-	if (ret)
-		goto err_out_region;
-
 	dev->base_addr	= pci_ioaddr;
 	dev->irq 	= pci_irq_line;
 	dev->dma	= 0;
@@ -142,7 +137,7 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic
 	ret = tmsdev_init(dev, &pdev->dev);
 	if (ret) {
 		printk("%s: unable to get memory for dev->priv.\n", dev->name);
-		goto err_out_irq;
+		goto err_out_region;
 	}
 
 	tp = netdev_priv(dev);
@@ -157,6 +152,11 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic
 
 	tp->tmspriv = cardinfo;
 
+	ret = request_irq(pdev->irq, tms380tr_interrupt, IRQF_SHARED,
+			  dev->name, dev);
+	if (ret)
+		goto err_out_tmsdev;
+
 	dev->open = tms380tr_open;
 	dev->stop = tms380tr_close;
 	pci_set_drvdata(pdev, dev);
@@ -164,15 +164,15 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic
 
 	ret = register_netdev(dev);
 	if (ret)
-		goto err_out_tmsdev;
+		goto err_out_irq;
 	
 	return 0;
 
+err_out_irq:
+	free_irq(pdev->irq, dev);
 err_out_tmsdev:
 	pci_set_drvdata(pdev, NULL);
 	tmsdev_term(dev);
-err_out_irq:
-	free_irq(pdev->irq, dev);
 err_out_region:
 	release_region(pci_ioaddr, TMS_PCI_IO_EXTENT);
 err_out_trdev:
-- 
cgit v0.10.2


From 7acab7a9ca6b0c5b820f083424c57e6ac2031d9d Mon Sep 17 00:00:00 2001
From: Enrik Berkhan <Enrik.Berkhan@ge.com>
Date: Thu, 5 Mar 2009 14:42:30 +0800
Subject: Blackfin arch: fix bug - The SPORT_HYS bit is not set for BF561 0.5

IMHO the setting should depend on ANOMALY_05000305 which is about the
availability of the bit, not ANOMALY_05000265 which only describes the
SPORT sensitivity to noise (checked for BF561 only, though).

If that's not true for other BF variants, maybe the definition of
ANOMALY_05000265 for BF561 should be changed to '(1)' instead.

Signed-off-by: Enrik Berkhan <Enrik.Berkhan@ge.com>
Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-common/clocks-init.c b/arch/blackfin/mach-common/clocks-init.c
index 9dddb6f..3539365 100644
--- a/arch/blackfin/mach-common/clocks-init.c
+++ b/arch/blackfin/mach-common/clocks-init.c
@@ -17,7 +17,7 @@
 #define SDGCTL_WIDTH (1 << 31)	/* SDRAM external data path width */
 #define PLL_CTL_VAL \
 	(((CONFIG_VCO_MULT & 63) << 9) | CLKIN_HALF | \
-	 (PLL_BYPASS << 8) | (ANOMALY_05000265 ? 0x8000 : 0))
+	 (PLL_BYPASS << 8) | (ANOMALY_05000305 ? 0 : 0x8000))
 
 __attribute__((l1_text))
 static void do_sync(void)
-- 
cgit v0.10.2


From 54acd0efab072cb70e87206329d561b297f93bbb Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 4 Mar 2009 23:01:02 -0800
Subject: net: Fix missing dev->neigh_setup in register_netdevice().

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/dev.c b/net/core/dev.c
index 9e4afe6..2dd484e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4339,6 +4339,7 @@ int register_netdevice(struct net_device *dev)
 		dev->do_ioctl = ops->ndo_do_ioctl;
 		dev->set_config = ops->ndo_set_config;
 		dev->change_mtu = ops->ndo_change_mtu;
+		dev->neigh_setup = ops->ndo_neigh_setup;
 		dev->tx_timeout = ops->ndo_tx_timeout;
 		dev->get_stats = ops->ndo_get_stats;
 		dev->vlan_rx_register = ops->ndo_vlan_rx_register;
-- 
cgit v0.10.2


From 9d40bbda599def1e1d155d7f7dca14fe8744bd2b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 4 Mar 2009 23:46:25 -0800
Subject: vlan: Fix vlan-in-vlan crashes.

As analyzed by Patrick McHardy, vlan needs to reset it's
netdev_ops pointer in it's ->init() function but this
leaves the compat method pointers stale.

Add a netdev_resync_ops() and call it from the vlan code.

Any other driver which changes ->netdev_ops after register_netdevice()
will need to call this new function after doing so too.

With help from Patrick McHardy.

Tested-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ec54785..6593667 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1079,6 +1079,7 @@ extern void		synchronize_net(void);
 extern int 		register_netdevice_notifier(struct notifier_block *nb);
 extern int		unregister_netdevice_notifier(struct notifier_block *nb);
 extern int		init_dummy_netdev(struct net_device *dev);
+extern void		netdev_resync_ops(struct net_device *dev);
 
 extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 extern struct net_device	*dev_get_by_index(struct net *net, int ifindex);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4a19acd..1b34135 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa)
 	int err = 0;
 
 	if (netif_device_present(real_dev) && ops->ndo_neigh_setup)
-		err = ops->ndo_neigh_setup(dev, pa);
+		err = ops->ndo_neigh_setup(real_dev, pa);
 
 	return err;
 }
@@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev)
 		dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
 		dev->netdev_ops         = &vlan_netdev_ops;
 	}
+	netdev_resync_ops(dev);
 
 	if (is_vlan_dev(real_dev))
 		subclass = 1;
diff --git a/net/core/dev.c b/net/core/dev.c
index 2dd484e..f112970 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4282,6 +4282,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
 }
 EXPORT_SYMBOL(netdev_fix_features);
 
+/* Some devices need to (re-)set their netdev_ops inside
+ * ->init() or similar.  If that happens, we have to setup
+ * the compat pointers again.
+ */
+void netdev_resync_ops(struct net_device *dev)
+{
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	dev->init = ops->ndo_init;
+	dev->uninit = ops->ndo_uninit;
+	dev->open = ops->ndo_open;
+	dev->change_rx_flags = ops->ndo_change_rx_flags;
+	dev->set_rx_mode = ops->ndo_set_rx_mode;
+	dev->set_multicast_list = ops->ndo_set_multicast_list;
+	dev->set_mac_address = ops->ndo_set_mac_address;
+	dev->validate_addr = ops->ndo_validate_addr;
+	dev->do_ioctl = ops->ndo_do_ioctl;
+	dev->set_config = ops->ndo_set_config;
+	dev->change_mtu = ops->ndo_change_mtu;
+	dev->neigh_setup = ops->ndo_neigh_setup;
+	dev->tx_timeout = ops->ndo_tx_timeout;
+	dev->get_stats = ops->ndo_get_stats;
+	dev->vlan_rx_register = ops->ndo_vlan_rx_register;
+	dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
+	dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	dev->poll_controller = ops->ndo_poll_controller;
+#endif
+#endif
+}
+EXPORT_SYMBOL(netdev_resync_ops);
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
@@ -4326,28 +4359,7 @@ int register_netdevice(struct net_device *dev)
 	 * This is temporary until all network devices are converted.
 	 */
 	if (dev->netdev_ops) {
-		const struct net_device_ops *ops = dev->netdev_ops;
-
-		dev->init = ops->ndo_init;
-		dev->uninit = ops->ndo_uninit;
-		dev->open = ops->ndo_open;
-		dev->change_rx_flags = ops->ndo_change_rx_flags;
-		dev->set_rx_mode = ops->ndo_set_rx_mode;
-		dev->set_multicast_list = ops->ndo_set_multicast_list;
-		dev->set_mac_address = ops->ndo_set_mac_address;
-		dev->validate_addr = ops->ndo_validate_addr;
-		dev->do_ioctl = ops->ndo_do_ioctl;
-		dev->set_config = ops->ndo_set_config;
-		dev->change_mtu = ops->ndo_change_mtu;
-		dev->neigh_setup = ops->ndo_neigh_setup;
-		dev->tx_timeout = ops->ndo_tx_timeout;
-		dev->get_stats = ops->ndo_get_stats;
-		dev->vlan_rx_register = ops->ndo_vlan_rx_register;
-		dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
-		dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
-#ifdef CONFIG_NET_POLL_CONTROLLER
-		dev->poll_controller = ops->ndo_poll_controller;
-#endif
+		netdev_resync_ops(dev);
 	} else {
 		char drivername[64];
 		pr_info("%s (%s): not using net_device_ops yet\n",
-- 
cgit v0.10.2


From f7e989ab64f926e34bafea0752431e1fd6a618f3 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Thu, 5 Mar 2009 17:33:36 +0800
Subject: Blackfin arch: make sure people do not set the kernel load address
 too high

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-common/arch_checks.c b/arch/blackfin/mach-common/arch_checks.c
index 98133b9..b3a2e3f 100644
--- a/arch/blackfin/mach-common/arch_checks.c
+++ b/arch/blackfin/mach-common/arch_checks.c
@@ -62,3 +62,8 @@
 #if (CONFIG_BOOT_LOAD & 0x3)
 # error "The kernel load address must be 4 byte aligned"
 #endif
+
+/* The entire kernel must be able to make a 24bit pcrel call to start of L1 */
+#if ((0xffffffff - L1_CODE_START + 1) + CONFIG_BOOT_LOAD) > 0x1000000
+# error "The kernel load address is too high; keep it below 10meg for safety"
+#endif
-- 
cgit v0.10.2


From c19577e34e641f802ad35fc0204ff68ea16fe7a3 Mon Sep 17 00:00:00 2001
From: Graf Yang <graf.yang@analog.com>
Date: Thu, 5 Mar 2009 17:35:59 +0800
Subject: Blackfin arch: Fix bug - make ksz8893m driver available when bfin_mac
 is enabled

Signed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf518/boards/ezbrd.c b/arch/blackfin/mach-bf518/boards/ezbrd.c
index 0e17534..b044de4 100644
--- a/arch/blackfin/mach-bf518/boards/ezbrd.c
+++ b/arch/blackfin/mach-bf518/boards/ezbrd.c
@@ -113,7 +113,6 @@ static struct platform_device bfin_mac_device = {
 	.name = "bfin_mac",
 	.dev.platform_data = &bfin_mii_bus,
 };
-#endif
 
 #if defined(CONFIG_NET_DSA_KSZ8893M) || defined(CONFIG_NET_DSA_KSZ8893M_MODULE)
 static struct dsa_platform_data ksz8893m_switch_data = {
@@ -132,6 +131,7 @@ static struct platform_device ksz8893m_switch_device = {
 	.dev.platform_data = &ksz8893m_switch_data,
 };
 #endif
+#endif
 
 #if defined(CONFIG_MTD_M25P80) \
 	|| defined(CONFIG_MTD_M25P80_MODULE)
@@ -171,6 +171,7 @@ static struct bfin5xx_spi_chip spi_adc_chip_info = {
 };
 #endif
 
+#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE)
 #if defined(CONFIG_NET_DSA_KSZ8893M) \
 	|| defined(CONFIG_NET_DSA_KSZ8893M_MODULE)
 /* SPI SWITCH CHIP */
@@ -179,6 +180,7 @@ static struct bfin5xx_spi_chip spi_switch_info = {
 	.bits_per_word = 8,
 };
 #endif
+#endif
 
 #if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
 static struct bfin5xx_spi_chip spi_mmc_chip_info = {
@@ -259,6 +261,7 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
+#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE)
 #if defined(CONFIG_NET_DSA_KSZ8893M) \
 	|| defined(CONFIG_NET_DSA_KSZ8893M_MODULE)
 	{
@@ -271,6 +274,7 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 		.mode = SPI_MODE_3,
 	},
 #endif
+#endif
 
 #if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
 	{
@@ -630,11 +634,10 @@ static struct platform_device *stamp_devices[] __initdata = {
 #if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE)
 	&bfin_mii_bus,
 	&bfin_mac_device,
-#endif
-
 #if defined(CONFIG_NET_DSA_KSZ8893M) || defined(CONFIG_NET_DSA_KSZ8893M_MODULE)
 	&ksz8893m_switch_device,
 #endif
+#endif
 
 #if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE)
 	&bfin_spi0_device,
-- 
cgit v0.10.2


From d7ff1a90b2d3d122b896056d63db919617e9b6cd Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 5 Mar 2009 18:26:59 +0800
Subject: Blackfin arch: Fix bug - KGDB single step into the middle of a 4
 bytes instruction on bf561 after soft bp is hit

Run IFLUSH twice to avoid loading wrong instruction
after invalidating icache and following sequence is met.

1) The one instruction address is cached in the icache.
2) This instruction in SDRAM is changed.
3) IFLASH[P0] is executed only once in lackfin_icache_flush_range().
4) This instruction is executed again, but not the changed new one.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-common/cache.S b/arch/blackfin/mach-common/cache.S
index 3c98dac..aa0648c 100644
--- a/arch/blackfin/mach-common/cache.S
+++ b/arch/blackfin/mach-common/cache.S
@@ -66,11 +66,33 @@
 
 /* Invalidate all instruction cache lines assocoiated with this memory area */
 ENTRY(_blackfin_icache_flush_range)
+/*
+ * Walkaround to avoid loading wrong instruction after invalidating icache
+ * and following sequence is met.
+ *
+ * 1) One instruction address is cached in the instruction cache.
+ * 2) This instruction in SDRAM is changed.
+ * 3) IFLASH[P0] is executed only once in blackfin_icache_flush_range().
+ * 4) This instruction is executed again, but the old one is loaded.
+ */
+	P0 = R0;
+	IFLUSH[P0];
 	do_flush IFLUSH, , nop
 ENDPROC(_blackfin_icache_flush_range)
 
 /* Flush all cache lines assocoiated with this area of memory. */
 ENTRY(_blackfin_icache_dcache_flush_range)
+/*
+ * Walkaround to avoid loading wrong instruction after invalidating icache
+ * and following sequence is met.
+ *
+ * 1) One instruction address is cached in the instruction cache.
+ * 2) This instruction in SDRAM is changed.
+ * 3) IFLASH[P0] is executed only once in blackfin_icache_flush_range().
+ * 4) This instruction is executed again, but the old one is loaded.
+ */
+	P0 = R0;
+	IFLUSH[P0];
 	do_flush FLUSH, IFLUSH
 ENDPROC(_blackfin_icache_dcache_flush_range)
 
-- 
cgit v0.10.2


From 5047607f0170b1f797a2fe37bb45310c1e1d5ce0 Mon Sep 17 00:00:00 2001
From: Graf Yang <graf.yang@analog.com>
Date: Thu, 5 Mar 2009 17:32:41 +0800
Subject: Blackfin arch: update default kernel config, select KSZ8893M driver
 for BF518

Signed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig
index 4fdb9e0..281f4b6 100644
--- a/arch/blackfin/configs/BF518F-EZBRD_defconfig
+++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.28-rc2
-# Fri Jan  9 17:58:41 2009
+# Linux kernel version: 2.6.28
+# Fri Feb 20 10:01:44 2009
 #
 # CONFIG_MMU is not set
 # CONFIG_FPU is not set
@@ -133,10 +133,15 @@ CONFIG_BF518=y
 # CONFIG_BF538 is not set
 # CONFIG_BF539 is not set
 # CONFIG_BF542 is not set
+# CONFIG_BF542M is not set
 # CONFIG_BF544 is not set
+# CONFIG_BF544M is not set
 # CONFIG_BF547 is not set
+# CONFIG_BF547M is not set
 # CONFIG_BF548 is not set
+# CONFIG_BF548M is not set
 # CONFIG_BF549 is not set
+# CONFIG_BF549M is not set
 # CONFIG_BF561 is not set
 CONFIG_BF_REV_MIN=0
 CONFIG_BF_REV_MAX=2
@@ -426,7 +431,17 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
 # CONFIG_TIPC is not set
 # CONFIG_ATM is not set
 # CONFIG_BRIDGE is not set
-# CONFIG_NET_DSA is not set
+CONFIG_NET_DSA=y
+# CONFIG_NET_DSA_TAG_DSA is not set
+# CONFIG_NET_DSA_TAG_EDSA is not set
+# CONFIG_NET_DSA_TAG_TRAILER is not set
+CONFIG_NET_DSA_TAG_STPID=y
+# CONFIG_NET_DSA_MV88E6XXX is not set
+# CONFIG_NET_DSA_MV88E6060 is not set
+# CONFIG_NET_DSA_MV88E6XXX_NEED_PPU is not set
+# CONFIG_NET_DSA_MV88E6131 is not set
+# CONFIG_NET_DSA_MV88E6123_61_65 is not set
+CONFIG_NET_DSA_KSZ8893M=y
 # CONFIG_VLAN_8021Q is not set
 # CONFIG_DECNET is not set
 # CONFIG_LLC2 is not set
@@ -529,6 +544,8 @@ CONFIG_MTD_COMPLEX_MAPPINGS=y
 #
 # Self-contained MTD device drivers
 #
+# CONFIG_MTD_DATAFLASH is not set
+# CONFIG_MTD_M25P80 is not set
 # CONFIG_MTD_SLRAM is not set
 # CONFIG_MTD_PHRAM is not set
 # CONFIG_MTD_MTDRAM is not set
@@ -561,7 +578,9 @@ CONFIG_BLK_DEV_RAM_SIZE=4096
 # CONFIG_BLK_DEV_HD is not set
 CONFIG_MISC_DEVICES=y
 # CONFIG_EEPROM_93CX6 is not set
+# CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
+# CONFIG_C2PORT is not set
 CONFIG_HAVE_IDE=y
 # CONFIG_IDE is not set
 
@@ -607,6 +626,7 @@ CONFIG_BFIN_RX_DESC_NUM=20
 # CONFIG_SMC91X is not set
 # CONFIG_SMSC911X is not set
 # CONFIG_DM9000 is not set
+# CONFIG_ENC28J60 is not set
 # CONFIG_IBM_NEW_EMAC_ZMII is not set
 # CONFIG_IBM_NEW_EMAC_RGMII is not set
 # CONFIG_IBM_NEW_EMAC_TAH is not set
@@ -764,7 +784,23 @@ CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100
 # CONFIG_I2C_DEBUG_ALGO is not set
 # CONFIG_I2C_DEBUG_BUS is not set
 # CONFIG_I2C_DEBUG_CHIP is not set
-# CONFIG_SPI is not set
+CONFIG_SPI=y
+# CONFIG_SPI_DEBUG is not set
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_BFIN=y
+# CONFIG_SPI_BFIN_LOCK is not set
+# CONFIG_SPI_BITBANG is not set
+
+#
+# SPI Protocol Masters
+#
+# CONFIG_SPI_AT25 is not set
+# CONFIG_SPI_SPIDEV is not set
+# CONFIG_SPI_TLE62X0 is not set
 CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
 # CONFIG_GPIOLIB is not set
 # CONFIG_W1 is not set
@@ -788,8 +824,10 @@ CONFIG_BFIN_WDT=y
 # CONFIG_MFD_SM501 is not set
 # CONFIG_HTC_PASIC3 is not set
 # CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
 # CONFIG_MFD_WM8400 is not set
 # CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_REGULATOR is not set
 
 #
 # Multimedia devices
@@ -861,10 +899,18 @@ CONFIG_RTC_INTF_DEV=y
 # CONFIG_RTC_DRV_M41T80 is not set
 # CONFIG_RTC_DRV_S35390A is not set
 # CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
 
 #
 # SPI RTC drivers
 #
+# CONFIG_RTC_DRV_M41T94 is not set
+# CONFIG_RTC_DRV_DS1305 is not set
+# CONFIG_RTC_DRV_DS1390 is not set
+# CONFIG_RTC_DRV_MAX6902 is not set
+# CONFIG_RTC_DRV_R9701 is not set
+# CONFIG_RTC_DRV_RS5C348 is not set
+# CONFIG_RTC_DRV_DS3234 is not set
 
 #
 # Platform RTC drivers
@@ -1062,12 +1108,20 @@ CONFIG_DEBUG_INFO=y
 # CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
 # CONFIG_FAULT_INJECTION is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+
+#
+# Tracers
+#
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
 # CONFIG_DYNAMIC_PRINTK_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
 # CONFIG_DEBUG_STACKOVERFLOW is not set
 # CONFIG_DEBUG_STACK_USAGE is not set
+# CONFIG_KGDB_TESTCASE is not set
 CONFIG_DEBUG_VERBOSE=y
 CONFIG_DEBUG_MMRS=y
 # CONFIG_DEBUG_HWERR is not set
@@ -1100,6 +1154,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_FIPS is not set
 # CONFIG_CRYPTO_MANAGER is not set
+# CONFIG_CRYPTO_MANAGER2 is not set
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
 # CONFIG_CRYPTO_CRYPTD is not set
-- 
cgit v0.10.2


From 72e2240f181871675d3a979766330c91d48a1673 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 5 Mar 2009 01:57:44 -0800
Subject: bonding: Fix device passed into ->ndo_neigh_setup().

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9fb3883..e0578fe 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4113,7 +4113,7 @@ static int bond_neigh_setup(struct net_device *dev, struct neigh_parms *parms)
 		const struct net_device_ops *slave_ops
 			= slave->dev->netdev_ops;
 		if (slave_ops->ndo_neigh_setup)
-			return slave_ops->ndo_neigh_setup(dev, parms);
+			return slave_ops->ndo_neigh_setup(slave->dev, parms);
 	}
 	return 0;
 }
-- 
cgit v0.10.2


From a1aade478862fca8710904532cbc6efed97fd05a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 16:11:35 -0800
Subject: x86/doc: mini-howto for using earlyprintk=dbgp

[ mingo: small edits and extensions. ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg KH <gregkh@suse.de>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Sarah Sharp <sarah.a.sharp@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AF18B7.4050305@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 0000000..607b1a0
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
+
+Mini-HOWTO for using the earlyprintk=dbgp boot option with a
+USB2 Debug port key and a debug cable, on x86 systems.
+
+You need two computers, the 'USB debug key' special gadget and
+and two USB cables, connected like this:
+
+  [host/target] <-------> [USB debug key] <-------> [client/console]
+
+1. There are three specific hardware requirements:
+
+ a.) Host/target system needs to have USB debug port capability.
+
+ You can check this capability by looking at a 'Debug port' bit in
+ the lspci -vvv output:
+
+ # lspci -vvv
+ ...
+ 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
+         Subsystem: Lenovo ThinkPad T61
+         Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
+         Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
+         Latency: 0
+         Interrupt: pin D routed to IRQ 19
+         Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
+         Capabilities: [50] Power Management version 2
+                 Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
+                 Status: D0 PME-Enable- DSel=0 DScale=0 PME+
+         Capabilities: [58] Debug port: BAR=1 offset=00a0
+                            ^^^^^^^^^^^ <==================== [ HERE ]
+	 Kernel driver in use: ehci_hcd
+         Kernel modules: ehci-hcd
+ ...
+
+( If your system does not list a debug port capability then you probably
+  wont be able to use the USB debug key. )
+
+ b.) You also need a Netchip USB debug cable/key:
+
+        http://www.plxtech.com/products/NET2000/NET20DC/default.asp
+
+     This is a small blue plastic connector with two USB connections,
+     it draws power from its USB connections.
+
+ c.) Thirdly, you need a second client/console system with a regular USB port.
+
+2. Software requirements:
+
+ a.) On the host/target system:
+
+    You need to enable the following kernel config option:
+
+      CONFIG_EARLY_PRINTK_DBGP=y
+
+    And you need to add the boot command line: "earlyprintk=dbgp".
+    (If you are using Grub, append it to the 'kernel' line in
+     /etc/grub.conf)
+
+    NOTE: normally earlyprintk console gets turned off once the
+    regular console is alive - use "earlyprintk=dbgp,keep" to keep
+    this channel open beyond early bootup. This can be useful for
+    debugging crashes under Xorg, etc.
+
+ b.) On the client/console system:
+
+    You should enable the following kernel config option:
+
+      CONFIG_USB_SERIAL_DEBUG=y
+
+    On the next bootup with the modified kernel you should
+    get a /dev/ttyUSBx device(s).
+
+    Now this channel of kernel messages is ready to be used: start
+    your favorite terminal emulator (minicom, etc.) and set
+    it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
+    see the raw output.
+
+ c.) On Nvidia Southbridge based systems: the kernel will try to probe
+     and find out which port has debug device connected.
+
+3. Testing that it works fine:
+
+   You can test the output by using earlyprintk=dbgp,keep and provoking
+   kernel messages on the host/target system. You can provoke a harmless
+   kernel message by for example doing:
+
+     echo h > /proc/sysrq-trigger
+
+   On the host/target system you should see this help line in "dmesg" output:
+
+     SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
+
+   On the client/console system do:
+
+       cat /dev/ttyUSB0
+
+   And you should see the help line above displayed shortly after you've
+   provoked it on the host system.
+
+If it does not work then please ask about it on the linux-kernel@vger.kernel.org
+mailing list or contact the x86 maintainers.
-- 
cgit v0.10.2


From 00049522425e8390d1815e1579733644ad2bb0c7 Mon Sep 17 00:00:00 2001
From: Robin Getz <rgetz@blackfin.uclinux.org>
Date: Thu, 5 Mar 2009 18:18:49 +0800
Subject: Blackfin arch: Random read/write errors are a bad thing

Random read/write errors are a bad thing - so don't let anyone
(including the test bench) run on something we know is bad.

Signed-off-by: Robin Getz <rgetz@blackfin.uclinux.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c
index e5c1162..4646b98 100644
--- a/arch/blackfin/kernel/setup.c
+++ b/arch/blackfin/kernel/setup.c
@@ -889,6 +889,10 @@ void __init setup_arch(char **cmdline_p)
 			       CPU, bfin_revid());
 	}
 
+	/* We can't run on BF548-0.1 due to ANOMALY 05000448 */
+	if (bfin_cpuid() == 0x27de && bfin_revid() == 1)
+		panic("You can't run on this processor due to 05000448\n");
+
 	printk(KERN_INFO "Blackfin Linux support by http://blackfin.uclinux.org/\n");
 
 	printk(KERN_INFO "Processor Speed: %lu MHz core clock and %lu MHz System Clock\n",
diff --git a/arch/blackfin/mach-common/arch_checks.c b/arch/blackfin/mach-common/arch_checks.c
index b3a2e3f..a2ca26a 100644
--- a/arch/blackfin/mach-common/arch_checks.c
+++ b/arch/blackfin/mach-common/arch_checks.c
@@ -67,3 +67,9 @@
 #if ((0xffffffff - L1_CODE_START + 1) + CONFIG_BOOT_LOAD) > 0x1000000
 # error "The kernel load address is too high; keep it below 10meg for safety"
 #endif
+
+#ifdef ANOMALY_05000448
+# if ANOMALY_05000448
+#  error You are using a part with anomaly 05000448, this issue causes random memory read/write failures - that means random crashes.
+# endif
+#endif
-- 
cgit v0.10.2


From 1400b3faab8fedfffde5a7fe47098e2732d4aa76 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 4 Mar 2009 16:02:46 -0600
Subject: x86: UV, SGI RTC: fix uv_time.c for UP

Fix non-smp build of uv_time.c.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20090304220246.GC6288@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 6f8e325..2ffb6c5 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -24,6 +24,8 @@
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/bios.h>
 #include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
 
 #define RTC_NAME		"sgi_rtc"
 
@@ -84,7 +86,7 @@ static void uv_rtc_send_IPI(int cpu)
 	unsigned long apicid, val;
 	int pnode;
 
-	apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	apicid = cpu_physical_id(cpu);
 	pnode = uv_apicid_to_pnode(apicid);
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
-- 
cgit v0.10.2


From ba0dade4bbe6bb26b975323726f7214278d5e994 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Thu, 5 Mar 2009 18:41:24 +0800
Subject: Blackfin arch: fix bug - On bf548-ezkit, ethernet fails to work after
 wakeup from "mem"

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-common/dpmc_modes.S b/arch/blackfin/mach-common/dpmc_modes.S
index 4da50bc..8009a51 100644
--- a/arch/blackfin/mach-common/dpmc_modes.S
+++ b/arch/blackfin/mach-common/dpmc_modes.S
@@ -376,10 +376,22 @@ ENTRY(_do_hibernate)
 #endif
 
 #ifdef PINT0_ASSIGN
+	PM_SYS_PUSH(PINT0_MASK_SET)
+	PM_SYS_PUSH(PINT1_MASK_SET)
+	PM_SYS_PUSH(PINT2_MASK_SET)
+	PM_SYS_PUSH(PINT3_MASK_SET)
 	PM_SYS_PUSH(PINT0_ASSIGN)
 	PM_SYS_PUSH(PINT1_ASSIGN)
 	PM_SYS_PUSH(PINT2_ASSIGN)
 	PM_SYS_PUSH(PINT3_ASSIGN)
+	PM_SYS_PUSH(PINT0_INVERT_SET)
+	PM_SYS_PUSH(PINT1_INVERT_SET)
+	PM_SYS_PUSH(PINT2_INVERT_SET)
+	PM_SYS_PUSH(PINT3_INVERT_SET)
+	PM_SYS_PUSH(PINT0_EDGE_SET)
+	PM_SYS_PUSH(PINT1_EDGE_SET)
+	PM_SYS_PUSH(PINT2_EDGE_SET)
+	PM_SYS_PUSH(PINT3_EDGE_SET)
 #endif
 
 	PM_SYS_PUSH(EBIU_AMBCTL0)
@@ -714,10 +726,22 @@ ENTRY(_do_hibernate)
 	PM_SYS_POP(EBIU_AMBCTL0)
 
 #ifdef PINT0_ASSIGN
+	PM_SYS_POP(PINT3_EDGE_SET)
+	PM_SYS_POP(PINT2_EDGE_SET)
+	PM_SYS_POP(PINT1_EDGE_SET)
+	PM_SYS_POP(PINT0_EDGE_SET)
+	PM_SYS_POP(PINT3_INVERT_SET)
+	PM_SYS_POP(PINT2_INVERT_SET)
+	PM_SYS_POP(PINT1_INVERT_SET)
+	PM_SYS_POP(PINT0_INVERT_SET)
 	PM_SYS_POP(PINT3_ASSIGN)
 	PM_SYS_POP(PINT2_ASSIGN)
 	PM_SYS_POP(PINT1_ASSIGN)
 	PM_SYS_POP(PINT0_ASSIGN)
+	PM_SYS_POP(PINT3_MASK_SET)
+	PM_SYS_POP(PINT2_MASK_SET)
+	PM_SYS_POP(PINT1_MASK_SET)
+	PM_SYS_POP(PINT0_MASK_SET)
 #endif
 
 #ifdef SICA_IWR1
-- 
cgit v0.10.2


From 8e2a769414604e016fcb3d0f15a2050b5d0d90c0 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Thu, 5 Mar 2009 18:45:07 +0800
Subject: Blackfin arch: mark init_pda as __init as only __init funcs all it

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c
index d0532b7..9c3629b 100644
--- a/arch/blackfin/mm/init.c
+++ b/arch/blackfin/mm/init.c
@@ -104,7 +104,7 @@ void __init paging_init(void)
 	}
 }
 
-asmlinkage void init_pda(void)
+asmlinkage void __init init_pda(void)
 {
 	unsigned int cpu = raw_smp_processor_id();
 
-- 
cgit v0.10.2


From 27276ba21fe590edc43305f483565aa02010bbbb Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Thu, 5 Mar 2009 18:47:20 +0800
Subject: Blackfin arch: remove spurious dash when dcache is off

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c
index 4646b98..a58687b 100644
--- a/arch/blackfin/kernel/setup.c
+++ b/arch/blackfin/kernel/setup.c
@@ -1145,12 +1145,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		icache_size = 0;
 
 	seq_printf(m, "cache size\t: %d KB(L1 icache) "
-		"%d KB(L1 dcache-%s) %d KB(L2 cache)\n",
+		"%d KB(L1 dcache%s) %d KB(L2 cache)\n",
 		icache_size, dcache_size,
 #if defined CONFIG_BFIN_WB
-		"wb"
+		"-wb"
 #elif defined CONFIG_BFIN_WT
-		"wt"
+		"-wt"
 #endif
 		"", 0);
 
-- 
cgit v0.10.2


From 7786ce823b1c9a3de01a63857e3451890cb4e81c Mon Sep 17 00:00:00 2001
From: Jie Zhang <jie.zhang@analog.com>
Date: Thu, 5 Mar 2009 18:50:26 +0800
Subject: Blackfin arch: fix bug - gdb signull case make trunk kernel panic
 frequently

Use copy_to_user_page and copy_from_user_page instead of
memcpy. copy_to_user_page does cache flush when necessary.

Signed-off-by: Jie Zhang <jie.zhang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index 594e325..d76618d 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -45,6 +45,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/dma.h>
 #include <asm/fixed_code.h>
+#include <asm/cacheflush.h>
 #include <asm/mem_map.h>
 
 #define TEXT_OFFSET 0
@@ -240,7 +241,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 
 			} else if (addr >= FIXED_CODE_START
 			    && addr + sizeof(tmp) <= FIXED_CODE_END) {
-				memcpy(&tmp, (const void *)(addr), sizeof(tmp));
+				copy_from_user_page(0, 0, 0, &tmp, (const void *)(addr), sizeof(tmp));
 				copied = sizeof(tmp);
 
 			} else
@@ -320,7 +321,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 
 			} else if (addr >= FIXED_CODE_START
 			    && addr + sizeof(data) <= FIXED_CODE_END) {
-				memcpy((void *)(addr), &data, sizeof(data));
+				copy_to_user_page(0, 0, 0, (void *)(addr), &data, sizeof(data));
 				copied = sizeof(data);
 
 			} else
-- 
cgit v0.10.2


From a1a15ac5f9aeee521c048a88fc1aec848e623de7 Mon Sep 17 00:00:00 2001
From: Kris Shannon <kris@shannon.id.au>
Date: Mon, 2 Mar 2009 19:47:37 +1100
Subject: Fix kernel NULL pointer dereference in xen-blkfront

When booting Xen Dom0 on a pre-release 3.2.1 hypervisor the system Oopses on a
"Unable to handle kernel NULL pointer dereference" in xenwatch.

From the backtrace it looks like backend_changed is calling bdget_disk
with a NULL pointer.  Checking for NULL and returning ENODEV instead
allows the kernel to boot.

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b6c8ce2..8f90508 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -977,6 +977,8 @@ static void backend_changed(struct xenbus_device *dev,
 		break;
 
 	case XenbusStateClosing:
+		if (info->gd == NULL)
+			xenbus_dev_fatal(dev, -ENODEV, "gd is NULL");
 		bd = bdget_disk(info->gd, 0);
 		if (bd == NULL)
 			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
-- 
cgit v0.10.2


From 5e18cfd04feca78cc08a6b8b71a60a610de81eaa Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 27 Feb 2009 08:10:26 +0100
Subject: cciss: remove 30 second initial timeout on controller reset

Commit 5e4c91c84b194b26cf592779e451f4b5be777cba forgot to remove the
initial sleep, get rid of it.

Thanks to Randy Dunlap <randy.dunlap@oracle.com> for spotting this error.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index b5a0611..4f9b6d7 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3606,11 +3606,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		if (cciss_hard_reset_controller(pdev) || cciss_reset_msi(pdev))
 			return -ENODEV;
 
-		/* Some devices (notably the HP Smart Array 5i Controller)
-		   need a little pause here */
-		schedule_timeout_uninterruptible(30*HZ);
-
-		/* Now try to get the controller to respond to a no-op */
+		/* Now try to get the controller to respond to a no-op. Some
+		   devices (notably the HP Smart Array 5i Controller) need
+		   up to 30 seconds to respond. */
 		for (i=0; i<30; i++) {
 			if (cciss_noop(pdev) == 0)
 				break;
-- 
cgit v0.10.2


From a3941ec101a5ec54c1e929730afeb196441a171e Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 5 Mar 2009 08:03:53 +0100
Subject: loop: don't increment p->offset with (size_t) -EINVAL

Upon a 'transfer error block' size is set to -EINVAL, but this becomes positive
since size is unsigned: p->offset still gets incremented.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index edbaac6..bf03455 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -392,8 +392,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct loop_device *lo = p->lo;
 	struct page *page = buf->page;
 	sector_t IV;
-	size_t size;
-	int ret;
+	int size, ret;
 
 	ret = buf->ops->confirm(pipe, buf);
 	if (unlikely(ret))
-- 
cgit v0.10.2


From fb7b75abe58acfa586445e430c29412090ad2058 Mon Sep 17 00:00:00 2001
From: Alon Bar-Lev <alon.barlev@gmail.com>
Date: Thu, 5 Mar 2009 19:42:43 +0800
Subject: Blackfin arch: cleanup bfin_sport.h header and export it to userspace

Signed-off-by: Alon Bar-Lev <alon.barlev@gmail.com>
Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index 606ecfd..09c3141 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -1,3 +1,4 @@
 include include/asm-generic/Kbuild.asm
 
+unifdef-y += bfin_sport.h
 unifdef-y += fixed_code.h
diff --git a/arch/blackfin/include/asm/bfin_sport.h b/arch/blackfin/include/asm/bfin_sport.h
index fe88a2c..65a651d 100644
--- a/arch/blackfin/include/asm/bfin_sport.h
+++ b/arch/blackfin/include/asm/bfin_sport.h
@@ -1,30 +1,9 @@
 /*
- * File:         include/asm-blackfin/bfin_sport.h
- * Based on:
- * Author:       Roy Huang (roy.huang@analog.com)
+ * bfin_sport.h - userspace header for bfin sport driver
  *
- * Created:      Thu Aug. 24 2006
- * Description:
+ * Copyright 2004-2008 Analog Devices Inc.
  *
- * Modified:
- *               Copyright 2004-2006 Analog Devices Inc.
- *
- * Bugs:         Enter bugs at http://blackfin.uclinux.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see the file COPYING, or write
- * to the Free Software Foundation, Inc.,
- * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ * Licensed under the GPL-2 or later.
  */
 
 #ifndef __BFIN_SPORT_H__
@@ -42,11 +21,10 @@
 #define NORM_FORMAT	0x0
 #define ALAW_FORMAT	0x2
 #define ULAW_FORMAT	0x3
-struct sport_register;
 
 /* Function driver which use sport must initialize the structure */
 struct sport_config {
-	/*TDM (multichannels), I2S or other mode */
+	/* TDM (multichannels), I2S or other mode */
 	unsigned int mode:3;
 
 	/* if TDM mode is selected, channels must be set */
@@ -72,12 +50,18 @@ struct sport_config {
 	int serial_clk;
 	int fsync_clk;
 
-	unsigned int data_format:2;	/*Normal, u-law or a-law */
+	unsigned int data_format:2;	/* Normal, u-law or a-law */
 
 	int word_len;		/* How length of the word in bits, 3-32 bits */
 	int dma_enabled;
 };
 
+/* Userspace interface */
+#define SPORT_IOC_MAGIC		'P'
+#define SPORT_IOC_CONFIG	_IOWR('P', 0x01, struct sport_config)
+
+#ifdef __KERNEL__
+
 struct sport_register {
 	unsigned short tcr1;
 	unsigned short reserved0;
@@ -117,9 +101,6 @@ struct sport_register {
 	unsigned long mrcs3;
 };
 
-#define SPORT_IOC_MAGIC		'P'
-#define SPORT_IOC_CONFIG	_IOWR('P', 0x01, struct sport_config)
-
 struct sport_dev {
 	struct cdev cdev;	/* Char device structure */
 
@@ -149,6 +130,8 @@ struct sport_dev {
 	struct sport_config config;
 };
 
+#endif
+
 #define SPORT_TCR1	0
 #define	SPORT_TCR2	1
 #define	SPORT_TCLKDIV	2
@@ -169,4 +152,4 @@ struct sport_dev {
 #define SPORT_MRCS2	22
 #define SPORT_MRCS3	23
 
-#endif				/*__BFIN_SPORT_H__*/
+#endif
-- 
cgit v0.10.2


From e7d3ef13d52a126438f687a1a32da65ff926ed57 Mon Sep 17 00:00:00 2001
From: Stuart Hayes <stuart_hayes@dell.com>
Date: Wed, 4 Mar 2009 11:59:46 -0800
Subject: libata: change drive ready wait after hard reset to 5s

This fixes problems during resume with drives that take longer than 1s to
be ready.  The ATA-6 spec appears to allow 5 seconds for a drive to be
ready.

On one affected system, this patch changes "PM: resume devices took..."
message from 17 seconds to 4 seconds, and gets rid of a lot of ugly
timeout/error messages.

Without this patch, the libata code moves on after 1s, tries to send a
soft reset (which the drive doesn't see because it isn't ready) which also
times out, then an IDENTIFY command is sent to the drive which times out,
and finally the error handler will try to send another hard reset which
will finally get things working.

Signed-off-by: Stuart Hayes <stuart_hayes@dell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5d87bc0..dd818c7 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -275,7 +275,7 @@ enum {
 	 * advised to wait only for the following duration before
 	 * doing SRST.
 	 */
-	ATA_TMOUT_PMP_SRST_WAIT	= 1000,
+	ATA_TMOUT_PMP_SRST_WAIT	= 5000,
 
 	/* ATA bus states */
 	BUS_UNKNOWN		= 0,
-- 
cgit v0.10.2


From 5825627c9463581fd9e70f8285685889ae5bb9bb Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 27 Feb 2009 17:35:43 +0900
Subject: libata: fix dma_unmap_sg misuse

libata passes the returned value of dma_map_sg() to
dma_unmap_sg(),which is the misuse of dma_unmap_sg().

DMA-mapping.txt says:

To unmap a scatterlist, just call:

	pci_unmap_sg(pdev, sglist, nents, direction);

Again, make sure DMA activity has already finished.

PLEASE NOTE:  The 'nents' argument to the pci_unmap_sg call must be
              the _same_ one you passed into the pci_map_sg call,
	      it should _NOT_ be the 'count' value _returned_ from the
              pci_map_sg call.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 9fbf059..5e324ce 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4612,7 +4612,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc)
 	VPRINTK("unmapping %u sg elements\n", qc->n_elem);
 
 	if (qc->n_elem)
-		dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
+		dma_unmap_sg(ap->dev, sg, qc->orig_n_elem, dir);
 
 	qc->flags &= ~ATA_QCFLAG_DMAMAP;
 	qc->sg = NULL;
@@ -4727,7 +4727,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc)
 		return -1;
 
 	DPRINTK("%d sg elements mapped\n", n_elem);
-
+	qc->orig_n_elem = qc->n_elem;
 	qc->n_elem = n_elem;
 	qc->flags |= ATA_QCFLAG_DMAMAP;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index dd818c7..fbf064e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -530,6 +530,7 @@ struct ata_queued_cmd {
 	unsigned long		flags;		/* ATA_QCFLAG_xxx */
 	unsigned int		tag;
 	unsigned int		n_elem;
+	unsigned int		orig_n_elem;
 
 	int			dma_dir;
 
-- 
cgit v0.10.2


From 84bda12af31f930e4200c5244aa111de2485d7b0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 2 Mar 2009 18:53:26 +0900
Subject: libata: align ap->sector_buf

ap->sector_buf is used as DMA target and should at least be aligned on
cacheline.  This caused problems on some embedded machines.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/include/linux/libata.h b/include/linux/libata.h
index fbf064e..dc18b87 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -751,7 +751,8 @@ struct ata_port {
 	acpi_handle		acpi_handle;
 	struct ata_acpi_gtm	__acpi_init_gtm; /* use ata_acpi_init_gtm() */
 #endif
-	u8			sector_buf[ATA_SECT_SIZE]; /* owned by EH */
+	/* owned by EH */
+	u8			sector_buf[ATA_SECT_SIZE] ____cacheline_aligned;
 };
 
 /* The following initializer overrides a method to NULL whether one of
-- 
cgit v0.10.2


From b53570814692db79c3525523b6e9ec9874416c04 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 2 Mar 2009 18:55:16 +0900
Subject: libata: don't use on-stack sense buffer

sense_buffer is used as DMA target and shouldn't be allocated on
stack.  Use ap->sector_buf instead.  This problem is spotted by Chuck
Ebbert.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index ce2ef04..009ccc7 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2901,7 +2901,7 @@ static int atapi_eh_clear_ua(struct ata_device *dev)
 	int i;
 
 	for (i = 0; i < ATA_EH_UA_TRIES; i++) {
-		u8 sense_buffer[SCSI_SENSE_BUFFERSIZE];
+		u8 *sense_buffer = dev->link->ap->sector_buf;
 		u8 sense_key = 0;
 		unsigned int err_mask;
 
-- 
cgit v0.10.2


From 7adbe46b9289794f8fe629cd78c876169741177f Mon Sep 17 00:00:00 2001
From: peerchen <peerchen@gmail.com>
Date: Fri, 27 Feb 2009 16:58:41 +0800
Subject: ahci: Add the Device IDs for MCP89 and remove IDs of MCP7B to/from
 ahci.c

Added the Device IDs for MCP89 AHCI controller.

Removed the IDs of MCP7B because this chipset had been cancelled.

Signed-off-by: Peer Chen <peerchen@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index a603bbf..66e012c 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -582,18 +582,18 @@ static const struct pci_device_id ahci_pci_tbl[] = {
 	{ PCI_VDEVICE(NVIDIA, 0x0abd), board_ahci },		/* MCP79 */
 	{ PCI_VDEVICE(NVIDIA, 0x0abe), board_ahci },		/* MCP79 */
 	{ PCI_VDEVICE(NVIDIA, 0x0abf), board_ahci },		/* MCP79 */
-	{ PCI_VDEVICE(NVIDIA, 0x0bc8), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bc9), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bca), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bcb), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bcc), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bcd), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bce), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bcf), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bc4), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bc5), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bc6), board_ahci },		/* MCP7B */
-	{ PCI_VDEVICE(NVIDIA, 0x0bc7), board_ahci },		/* MCP7B */
+	{ PCI_VDEVICE(NVIDIA, 0x0d84), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d85), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d86), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d87), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d88), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d89), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d8a), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d8b), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d8c), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d8d), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d8e), board_ahci },		/* MCP89 */
+	{ PCI_VDEVICE(NVIDIA, 0x0d8f), board_ahci },		/* MCP89 */
 
 	/* SiS */
 	{ PCI_VDEVICE(SI, 0x1184), board_ahci },		/* SiS 966 */
-- 
cgit v0.10.2


From 55f784c826af2506e417bcc484d7e0e4d27f1977 Mon Sep 17 00:00:00 2001
From: Brandon Ehle <azverkan@yahoo.com>
Date: Sun, 1 Mar 2009 00:02:49 -0800
Subject: sata_nv: fix module parameter description

Update MODULE_PARM_DESC for ADMA to reflect the fact that the
option is disabled by default.

Signed-off-by: Brandon Ehle <azverkan@yahoo.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 55a8eed..f65b537 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -2523,7 +2523,7 @@ static void __exit nv_exit(void)
 module_init(nv_init);
 module_exit(nv_exit);
 module_param_named(adma, adma_enabled, bool, 0444);
-MODULE_PARM_DESC(adma, "Enable use of ADMA (Default: true)");
+MODULE_PARM_DESC(adma, "Enable use of ADMA (Default: false)");
 module_param_named(swncq, swncq_enabled, bool, 0444);
 MODULE_PARM_DESC(swncq, "Enable use of SWNCQ (Default: true)");
 
-- 
cgit v0.10.2


From d6515e6ff4ad3db4bd5ef2dd4e1026a7aca2482e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 4 Mar 2009 15:59:30 +0900
Subject: libata: make sure port is thawed when skipping resets

When SCR access is available and the link is offline, softreset is
skipped as it only wastes time and some controllers don't respond very
well.  However, the skip path forgot to thaw the port, which not only
blocks further event notification from the port but also causes
repeated EH invocations on the same event on drivers which rely on
->thaw() to clear events if the IRQ is shared with another device or
port.

This problem has always been there but is uncovered by recent sata_nv
nf2/3 change which dropped hardreset support while maintaining SCR
access.  nf2/3 doesn't clear hotplug event mask from the interrupt
handler but relies on ->thaw() to clear them.  When the hardreset was
there, the reset action was never skipped and the port was always
thawed but, with the hardreset gone, ->prereset() determines that
there's no need for softreset and both ->softreset() and ->thaw() are
skipped.  This leads to stuck hotplug event in the IRQ status register
triggering hotplug event whenever IRQ is delieverd on the same IRQ.
As the controller shares the same IRQ for both ports, this happens on
every IO if one port is occpupied and the other isn't.

This patch fixes the problem by making sure that the port is thawed on
reset-skip path.

bko#11615 reports this problem.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Robert Hancock <hancockrwd@gmail.com>
Reported-by: Dan Andresan <danyer@gmail.com>
Reported-by: Arne Woerner <arne_woerner@yahoo.com>
Reported-by: Stefan Lippers-Hollmann <s.L-H@gmx.de>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 009ccc7..ea89091 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2423,11 +2423,14 @@ int ata_eh_reset(struct ata_link *link, int classify,
 		}
 
 		/* prereset() might have cleared ATA_EH_RESET.  If so,
-		 * bang classes and return.
+		 * bang classes, thaw and return.
 		 */
 		if (reset && !(ehc->i.action & ATA_EH_RESET)) {
 			ata_for_each_dev(dev, link, ALL)
 				classes[dev->devno] = ATA_DEV_NONE;
+			if ((ap->pflags & ATA_PFLAG_FROZEN) &&
+			    ata_is_host_link(link))
+				ata_eh_thaw_port(ap);
 			rc = 0;
 			goto out;
 		}
-- 
cgit v0.10.2


From 968e594afdbc40b4270f9d4032ae8350475749d6 Mon Sep 17 00:00:00 2001
From: Robert Hancock <hancockrwd@gmail.com>
Date: Mon, 16 Feb 2009 20:15:08 -0600
Subject: libata: Don't trust current capacity values in identify words 57-58
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hanno Böck reported a problem where an old Conner CP30254 240MB hard drive
was reported as 1.1TB in capacity by libata:

http://lkml.org/lkml/2009/2/13/134

This was caused by libata trusting the drive's reported current capacity in
sectors in identify words 57 and 58 if the drive does not support LBA and the
current CHS translation values appear valid. Unfortunately it seems older
ATA specs were vague about what this field should contain and a number of drives
used values with wrong byte order or that were totally bogus. There's no
unique information that it conveys and so we can just calculate the number
of sectors from the reported current CHS values.

While we're at it, clean up this function to use named constants for the
identify word values.

Signed-off-by: Robert Hancock <hancockrwd@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 5e324ce..060bcd6 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1322,14 +1322,16 @@ static u64 ata_id_n_sectors(const u16 *id)
 {
 	if (ata_id_has_lba(id)) {
 		if (ata_id_has_lba48(id))
-			return ata_id_u64(id, 100);
+			return ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
 		else
-			return ata_id_u32(id, 60);
+			return ata_id_u32(id, ATA_ID_LBA_CAPACITY);
 	} else {
 		if (ata_id_current_chs_valid(id))
-			return ata_id_u32(id, 57);
+			return id[ATA_ID_CUR_CYLS] * id[ATA_ID_CUR_HEADS] *
+			       id[ATA_ID_CUR_SECTORS];
 		else
-			return id[1] * id[3] * id[6];
+			return id[ATA_ID_CYLS] * id[ATA_ID_HEADS] *
+			       id[ATA_ID_SECTORS];
 	}
 }
 
-- 
cgit v0.10.2


From c3f5d2d8b5fa6eb0cc1c47fd162bf6432f206f42 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:52 +0200
Subject: x86: init_memory_mapping() trivial cleanups

Impact: cleanup

To reduce the diff between the 32-bit and 64-bit versions of
init_memory_mapping(), fix up all trivial issues.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-1-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c351456..ad4e03c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -868,11 +868,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
 
 	table_start >>= PAGE_SHIFT;
 	table_end = table_start;
-	table_top = table_start + (tables>>PAGE_SHIFT);
+	table_top = table_start + (tables >> PAGE_SHIFT);
 
 	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT,
-		(table_start << PAGE_SHIFT) + tables);
+		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
 }
 
 struct map_range {
@@ -899,8 +898,13 @@ static int save_mr(struct map_range *mr, int nr_range,
 	return nr_range;
 }
 
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
 unsigned long __init_refok init_memory_mapping(unsigned long start,
-						unsigned long end)
+					       unsigned long end)
 {
 	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long page_size_mask = 0;
@@ -911,7 +915,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	int nr_range, i;
 	int use_pse;
 
-	printk(KERN_INFO "init_memory_mapping: %08lx-%08lx\n", start, end);
+	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	/*
@@ -940,19 +944,19 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
 
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-
 	if (use_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 
+	memset(mr, 0, sizeof(mr));
+	nr_range = 0;
+
 	/*
 	 * Don't use a large page for the first 2/4MB of memory
 	 * because there are often fixed size MTRRs in there
 	 * and overlapping MTRRs into large pages can cause
 	 * slowdowns.
 	 */
-	/* head could not be big page alignment ? */
+	/* head if not big page alignment ? */
 	start_pfn = start >> PAGE_SHIFT;
 	pos = start_pfn << PAGE_SHIFT;
 	if (pos == 0)
@@ -960,14 +964,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	else
 		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 				 << (PMD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > (end>>PAGE_SHIFT))
-		end_pfn = end>>PAGE_SHIFT;
+	if (end_pfn > (end >> PAGE_SHIFT))
+		end_pfn = end >> PAGE_SHIFT;
 	if (start_pfn < end_pfn) {
 		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
 		pos = end_pfn << PAGE_SHIFT;
 	}
 
-	/* big page range */
+	/* big page (2M) range */
 	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 			 << (PMD_SHIFT - PAGE_SHIFT);
 	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
@@ -977,7 +981,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		pos = end_pfn << PAGE_SHIFT;
 	}
 
-	/* tail is not big page alignment ? */
+	/* tail is not big page (2M) alignment */
 	start_pfn = pos>>PAGE_SHIFT;
 	end_pfn = end>>PAGE_SHIFT;
 	if (start_pfn < end_pfn)
@@ -998,13 +1002,17 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	}
 
 	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %08lx - %08lx page %s\n",
-			mr[i].start, mr[i].end,
-			(mr[i].page_size_mask & (1<<PG_LEVEL_2M)) ?
-				  "big page" : "4k");
+		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+				mr[i].start, mr[i].end,
+			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
 	/*
 	 * Find space for the kernel direct mapping tables.
+	 *
+	 * Later we should allocate these tables in the local node of the
+	 * memory mapped. Unfortunately this is done currently before the
+	 * nodes are discovered.
 	 */
 	if (!after_init_bootmem)
 		find_early_table_space(end, use_pse);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d325186..cdb3be1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -554,20 +554,25 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
 	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
 	if (use_gbpages) {
 		unsigned long extra;
+
 		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
 		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
 	} else
 		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
 	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
 
 	if (use_pse) {
 		unsigned long extra;
+
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
 	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
 	/*
@@ -647,7 +652,6 @@ static int save_mr(struct map_range *mr, int nr_range,
 		   unsigned long start_pfn, unsigned long end_pfn,
 		   unsigned long page_size_mask)
 {
-
 	if (start_pfn < end_pfn) {
 		if (nr_range >= NR_RANGE_MR)
 			panic("run out of range for init_memory_mapping\n");
@@ -679,13 +683,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
 
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
 	if (!after_bootmem)
 		init_gbpages();
 
@@ -709,7 +706,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	memset(mr, 0, sizeof(mr));
 	nr_range = 0;
 
-	/* head if not big page alignment ?*/
+	/* head if not big page alignment ? */
 	start_pfn = start >> PAGE_SHIFT;
 	pos = start_pfn << PAGE_SHIFT;
 	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
@@ -721,7 +718,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		pos = end_pfn << PAGE_SHIFT;
 	}
 
-	/* big page (2M) range*/
+	/* big page (2M) range */
 	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 			 << (PMD_SHIFT - PAGE_SHIFT);
 	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
@@ -769,7 +766,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		/* move it */
 		old_start = mr[i].start;
 		memmove(&mr[i], &mr[i+1],
-			 (nr_range - 1 - i) * sizeof (struct map_range));
+			(nr_range - 1 - i) * sizeof(struct map_range));
 		mr[i--].start = old_start;
 		nr_range--;
 	}
@@ -780,6 +777,13 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
 			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
+	/*
+	 * Find space for the kernel direct mapping tables.
+	 *
+	 * Later we should allocate these tables in the local node of the
+	 * memory mapped. Unfortunately this is done currently before the
+	 * nodes are discovered.
+	 */
 	if (!after_bootmem)
 		find_early_table_space(end, use_pse, use_gbpages);
 
-- 
cgit v0.10.2


From 4bbd4fa03832208f0e6e0b9e73a0ffa2620a626a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:53 +0200
Subject: x86: add gbpages support to 32-bit init_memory_mapping()

Impact: cleanup

To reduce the diff between the 32-bit and 64-bit versions of
init_memory_mapping(), add gbpages support to the 32-bit version.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-2-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ad4e03c..5fad0f9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -65,6 +65,8 @@ static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
 
+int direct_gbpages;
+
 static __init void *alloc_low_page(void)
 {
 	unsigned long pfn = table_end++;
@@ -831,14 +833,22 @@ void __init setup_bootmem_allocator(void)
 	after_init_bootmem = 1;
 }
 
-static void __init find_early_table_space(unsigned long end, int use_pse)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
 {
 	unsigned long puds, pmds, ptes, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
 	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
 
-	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+	if (use_gbpages) {
+		unsigned long extra;
+
+		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+	} else
+		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
 	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
 
 	if (use_pse) {
@@ -913,7 +923,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
-	int use_pse;
+	int use_pse, use_gbpages;
 
 	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
 
@@ -923,9 +933,10 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
-	use_pse = 0;
+	use_pse = use_gbpages = 0;
 #else
 	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
 #endif
 
 #ifdef CONFIG_X86_PAE
@@ -944,6 +955,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
 
+	if (use_gbpages)
+		page_size_mask |= 1 << PG_LEVEL_1G;
 	if (use_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 
@@ -1015,7 +1028,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	 * nodes are discovered.
 	 */
 	if (!after_init_bootmem)
-		find_early_table_space(end, use_pse);
+		find_early_table_space(end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
 		kernel_physical_mapping_init(pgd_base,
-- 
cgit v0.10.2


From 49a2bf7303b0dc5fccbb3ff7cf2e7751f0e3953d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:54 +0200
Subject: x86: find_early_table_space() unification

Impact: cleanup

There are some minor differences between the 32-bit and 64-bit
find_early_table_space() functions. This patch wraps those
differences under CONFIG_X86_32 to make the function identical
on both configurations.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-3-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5fad0f9..86a9994 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -855,24 +855,33 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 		unsigned long extra;
 
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
 		extra += PMD_SIZE;
+#endif
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
+#ifdef CONFIG_X86_32
 	/* for fixmap */
 	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
 	 * cause a hotspot and fill up ZONE_DMA. The page tables
 	 * need roughly 0.5KB per GB.
 	 */
+#ifdef CONFIG_X86_32
 	start = 0x7000;
 	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
 					tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+	start = 0x8000;
+	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
 	if (table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index cdb3be1..151e5ba 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -569,19 +569,33 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 		unsigned long extra;
 
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+		extra += PMD_SIZE;
+#endif
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
+#ifdef CONFIG_X86_32
+	/* for fixmap */
+	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
 	/*
 	 * RED-PEN putting page tables only on node 0 could
 	 * cause a hotspot and fill up ZONE_DMA. The page tables
 	 * need roughly 0.5KB per GB.
 	 */
+#ifdef CONFIG_X86_32
+	start = 0x7000;
+	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+					tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
 	start = 0x8000;
 	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
 	if (table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
-- 
cgit v0.10.2


From e7179853e7552ba6631e2cdd9f5c374383403b4b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:55 +0200
Subject: x86: move pgd_base out of init_memory_mapping()

Impact: cleanup

This patch moves pgd_base out of init_memory_mapping() to reduce
the diff between the 32-bit version and the 64-bit version of the
function.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-4-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 86a9994..cfc68d6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -227,11 +227,11 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
-						unsigned long start_pfn,
+static void __init kernel_physical_mapping_init(unsigned long start_pfn,
 						unsigned long end_pfn,
 						int use_pse)
 {
+	pgd_t *pgd_base = swapper_pg_dir;
 	int pgd_idx, pmd_idx, pte_ofs;
 	unsigned long pfn;
 	pgd_t *pgd;
@@ -509,8 +509,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+static void __init early_ioremap_page_table_range_init(void)
 {
+	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long vaddr, end;
 
 	/*
@@ -925,7 +926,6 @@ static int save_mr(struct map_range *mr, int nr_range,
 unsigned long __init_refok init_memory_mapping(unsigned long start,
 					       unsigned long end)
 {
-	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
 	unsigned long pos;
@@ -1040,12 +1040,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		find_early_table_space(end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
-		kernel_physical_mapping_init(pgd_base,
+		kernel_physical_mapping_init(
 				mr[i].start >> PAGE_SHIFT,
 				mr[i].end >> PAGE_SHIFT,
 				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
 
-	early_ioremap_page_table_range_init(pgd_base);
+	early_ioremap_page_table_range_init();
 
 	load_cr3(swapper_pg_dir);
 
-- 
cgit v0.10.2


From 54e63f3a4282a8bc5b39db29095f076ece2b1073 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:56 +0200
Subject: x86: ifdef 32-bit specific setup in init_memory_mapping()

Impact: cleanup

Enabling NX, PSE, and PGE are only required on 32-bit so ifdef them
in both versions of the function.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-5-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index cfc68d6..eb98cb9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -948,6 +948,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	use_gbpages = direct_gbpages;
 #endif
 
+#ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_PAE
 	set_nx();
 	if (nx_enabled)
@@ -963,6 +964,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		set_in_cr4(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
+#endif
 
 	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 151e5ba..c3c0be5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -712,6 +712,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	use_gbpages = direct_gbpages;
 #endif
 
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+	set_nx();
+	if (nx_enabled)
+		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+	/* Enable PSE if available */
+	if (cpu_has_pse)
+		set_in_cr4(X86_CR4_PSE);
+
+	/* Enable PGE if available */
+	if (cpu_has_pge) {
+		set_in_cr4(X86_CR4_PGE);
+		__supported_pte_mask |= _PAGE_GLOBAL;
+	}
+#endif
+
 	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
 	if (use_pse)
-- 
cgit v0.10.2


From 96083ca11bc85265c7ef9e791a57e3514d8f605a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:57 +0200
Subject: x86: remove unnecessary save_mr() sanity check

Impact: cleanup

The save_mr() function already checks that start_pfn is less than
end_pfn so we can remove the unnecessary check which reduces the
diff between the 32-bit and the 64-bit versions of init_memory_mapping().

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-6-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index eb98cb9..559715b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -1008,8 +1008,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	/* tail is not big page (2M) alignment */
 	start_pfn = pos>>PAGE_SHIFT;
 	end_pfn = end>>PAGE_SHIFT;
-	if (start_pfn < end_pfn)
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
 
 	/* try to merge same page size and continuous */
 	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-- 
cgit v0.10.2


From c464573cb3d3bdd45eed8f5f59596f84ede95a0c Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:58 +0200
Subject: x86: rename after_init_bootmem to after_bootmem in mm/init_32.c

Impact: cleanup

This patch renames after_init_bootmem to after_bootmem in
mm/init_32.c to reduce the diff to the 64-bit version of of
init_memory_mapping().

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-7-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 559715b..cc5c399 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -63,7 +63,7 @@ static unsigned long __initdata table_start;
 static unsigned long __meminitdata table_end;
 static unsigned long __meminitdata table_top;
 
-static int __initdata after_init_bootmem;
+int after_bootmem;
 
 int direct_gbpages;
 
@@ -92,7 +92,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
 	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-		if (after_init_bootmem)
+		if (after_bootmem)
 			pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
 		else
 			pmd_table = (pmd_t *)alloc_low_page();
@@ -119,7 +119,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = NULL;
 
-		if (after_init_bootmem) {
+		if (after_bootmem) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 			page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
@@ -158,7 +158,7 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 		pte_t *newpte;
 		int i;
 
-		BUG_ON(after_init_bootmem);
+		BUG_ON(after_bootmem);
 		newpte = alloc_low_page();
 		for (i = 0; i < PTRS_PER_PTE; i++)
 			set_pte(newpte + i, pte[i]);
@@ -831,7 +831,7 @@ void __init setup_bootmem_allocator(void)
 	bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
 #endif
 
-	after_init_bootmem = 1;
+	after_bootmem = 1;
 }
 
 static void __init find_early_table_space(unsigned long end, int use_pse,
@@ -1037,7 +1037,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	 * memory mapped. Unfortunately this is done currently before the
 	 * nodes are discovered.
 	 */
-	if (!after_init_bootmem)
+	if (!after_bootmem)
 		find_early_table_space(end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
@@ -1052,11 +1052,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	__flush_tlb_all();
 
-	if (!after_init_bootmem)
+	if (!after_bootmem)
 		reserve_early(table_start << PAGE_SHIFT,
 				 table_end << PAGE_SHIFT, "PGTABLE");
 
-	if (!after_init_bootmem)
+	if (!after_bootmem)
 		early_memtest(start, end);
 
 	return end >> PAGE_SHIFT;
-- 
cgit v0.10.2


From cbba65796df99f3ca9bf70d14e5a19384c54b6a1 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:54:59 +0200
Subject: x86: unify kernel_physical_mapping_init() call in
 init_memory_mapping()

Impact: cleanup

The 64-bit version of init_memory_mapping() uses the last mapped
address returned from kernel_physical_mapping_init() whereas the
32-bit version doesn't. This patch adds relevant ifdefs to both
versions of the function to reduce the diff between them.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-8-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index cc5c399..00c1d85 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -929,6 +929,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
 	unsigned long pos;
+	unsigned long ret;
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
@@ -1040,11 +1041,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		find_early_table_space(end, use_pse, use_gbpages);
 
+#ifdef CONFIG_X86_32
 	for (i = 0; i < nr_range; i++)
 		kernel_physical_mapping_init(
 				mr[i].start >> PAGE_SHIFT,
 				mr[i].end >> PAGE_SHIFT,
 				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
+	ret = end;
+#else /* CONFIG_X86_64 */
+	for (i = 0; i < nr_range; i++)
+		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+						   mr[i].page_size_mask);
+#endif
 
 	early_ioremap_page_table_range_init();
 
@@ -1059,7 +1067,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		early_memtest(start, end);
 
-	return end >> PAGE_SHIFT;
+	return ret >> PAGE_SHIFT;
 }
 
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c3c0be5..e4fadea 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -686,10 +686,10 @@ static int save_mr(struct map_range *mr, int nr_range,
 unsigned long __init_refok init_memory_mapping(unsigned long start,
 					       unsigned long end)
 {
-	unsigned long last_map_addr = 0;
 	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
 	unsigned long pos;
+	unsigned long ret;
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
@@ -819,10 +819,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		find_early_table_space(end, use_pse, use_gbpages);
 
+#ifdef CONFIG_X86_32
+	for (i = 0; i < nr_range; i++)
+		kernel_physical_mapping_init(
+				mr[i].start >> PAGE_SHIFT,
+				mr[i].end >> PAGE_SHIFT,
+				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
+	ret = end;
+#else /* CONFIG_X86_64 */
 	for (i = 0; i < nr_range; i++)
-		last_map_addr = kernel_physical_mapping_init(
-					mr[i].start, mr[i].end,
-					mr[i].page_size_mask);
+		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+						   mr[i].page_size_mask);
+#endif
 
 	if (!after_bootmem)
 		mmu_cr4_features = read_cr4();
@@ -832,13 +840,10 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		reserve_early(table_start << PAGE_SHIFT,
 				 table_end << PAGE_SHIFT, "PGTABLE");
 
-	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
-			 last_map_addr, end);
-
 	if (!after_bootmem)
 		early_memtest(start, end);
 
-	return last_map_addr >> PAGE_SHIFT;
+	return ret >> PAGE_SHIFT;
 }
 
 #ifndef CONFIG_NUMA
-- 
cgit v0.10.2


From d58e854e36ddf241ebc243e4122c5ab087bf38df Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:00 +0200
Subject: x86: add table start and end sanity checks to 32-bit
 init_memory_mapping()

Impact: cleanup

This patch adds a sanity check to the 32-bit version of
init_memory_mapping() to reduce the diff to the 64-bit version.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-9-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 00c1d85..0a3707f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -1060,7 +1060,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	__flush_tlb_all();
 
-	if (!after_bootmem)
+	if (!after_bootmem && table_end > table_start)
 		reserve_early(table_start << PAGE_SHIFT,
 				 table_end << PAGE_SHIFT, "PGTABLE");
 
-- 
cgit v0.10.2


From 01ced9ec14ad1b4f8a533c2f2b5a4fe4c92c1099 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:01 +0200
Subject: x86: ifdef 32-bit and 64-bit setup in init_memory_mapping()

Impact: cleanup

To reduce the diff between the 32-bit and 64-bit versions of
init_memory_mapping(), ifdef configuration specific setup code
in the function.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-10-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0a3707f..3f91bdc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -1054,10 +1054,16 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 						   mr[i].page_size_mask);
 #endif
 
+#ifdef CONFIG_X86_32
 	early_ioremap_page_table_range_init();
 
 	load_cr3(swapper_pg_dir);
+#endif
 
+#ifdef CONFIG_X86_64
+	if (!after_bootmem)
+		mmu_cr4_features = read_cr4();
+#endif
 	__flush_tlb_all();
 
 	if (!after_bootmem && table_end > table_start)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e4fadea..5ecb23a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -832,8 +832,16 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 						   mr[i].page_size_mask);
 #endif
 
+#ifdef CONFIG_X86_32
+	early_ioremap_page_table_range_init();
+
+	load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
 	if (!after_bootmem)
 		mmu_cr4_features = read_cr4();
+#endif
 	__flush_tlb_all();
 
 	if (!after_bootmem && table_end > table_start)
-- 
cgit v0.10.2


From c338d6f60fc29dfc74bd82b91526ef43ba992bab Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:02 +0200
Subject: x86: ifdef 32-bit and 64-bit pfn setup in init_memory_mapping()

Impact: cleanup

To reduce the diff between the 32-bit and 64-bit versions of
init_memory_mapping(), ifdef configuration specific pfn setup
code in the function.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-11-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3f91bdc..34760e4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -975,20 +975,25 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	memset(mr, 0, sizeof(mr));
 	nr_range = 0;
 
+	/* head if not big page alignment ? */
+	start_pfn = start >> PAGE_SHIFT;
+	pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
 	/*
 	 * Don't use a large page for the first 2/4MB of memory
 	 * because there are often fixed size MTRRs in there
 	 * and overlapping MTRRs into large pages can cause
 	 * slowdowns.
 	 */
-	/* head if not big page alignment ? */
-	start_pfn = start >> PAGE_SHIFT;
-	pos = start_pfn << PAGE_SHIFT;
 	if (pos == 0)
 		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
 	else
 		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 				 << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+			<< (PMD_SHIFT - PAGE_SHIFT);
+#endif
 	if (end_pfn > (end >> PAGE_SHIFT))
 		end_pfn = end >> PAGE_SHIFT;
 	if (start_pfn < end_pfn) {
@@ -999,12 +1004,43 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	/* big page (2M) range */
 	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 			 << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
 	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+#ifdef CONFIG_X86_64
+	/* big page (1G) range */
+	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask &
+				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+	/* tail is not big page (1G) alignment */
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
 	if (start_pfn < end_pfn) {
 		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
 				page_size_mask & (1<<PG_LEVEL_2M));
 		pos = end_pfn << PAGE_SHIFT;
 	}
+#endif
 
 	/* tail is not big page (2M) alignment */
 	start_pfn = pos>>PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5ecb23a..d99bc6a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -741,8 +741,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	/* head if not big page alignment ? */
 	start_pfn = start >> PAGE_SHIFT;
 	pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+	/*
+	 * Don't use a large page for the first 2/4MB of memory
+	 * because there are often fixed size MTRRs in there
+	 * and overlapping MTRRs into large pages can cause
+	 * slowdowns.
+	 */
+	if (pos == 0)
+		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+	else
+		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+				 << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
 	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
 			<< (PMD_SHIFT - PAGE_SHIFT);
+#endif
 	if (end_pfn > (end >> PAGE_SHIFT))
 		end_pfn = end >> PAGE_SHIFT;
 	if (start_pfn < end_pfn) {
@@ -753,16 +767,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	/* big page (2M) range */
 	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 			 << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
 	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
 			 << (PUD_SHIFT - PAGE_SHIFT);
 	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
 		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
 	if (start_pfn < end_pfn) {
 		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
 				page_size_mask & (1<<PG_LEVEL_2M));
 		pos = end_pfn << PAGE_SHIFT;
 	}
 
+#ifdef CONFIG_X86_64
 	/* big page (1G) range */
 	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
 			 << (PUD_SHIFT - PAGE_SHIFT);
@@ -783,6 +803,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 				page_size_mask & (1<<PG_LEVEL_2M));
 		pos = end_pfn << PAGE_SHIFT;
 	}
+#endif
 
 	/* tail is not big page (2M) alignment */
 	start_pfn = pos>>PAGE_SHIFT;
-- 
cgit v0.10.2


From b47e3418c52b26f6143fc696326ae52a21324551 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:03 +0200
Subject: x86: ifdef 32-bit and 64-bit NR_RANGE_MR for save_mr() unification

Impact: cleanup

As a trivial preparation for moving common code to arc/x86/mm/init.c,
ifdef the 32-bit and 64-bit versions of NR_RANGE_MR.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-12-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 34760e4..f59e9b8 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -900,7 +900,11 @@ struct map_range {
 	unsigned page_size_mask;
 };
 
+#ifdef CONFIG_X86_32
 #define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
 
 static int save_mr(struct map_range *mr, int nr_range,
 		   unsigned long start_pfn, unsigned long end_pfn,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d99bc6a..d101990 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -660,7 +660,11 @@ struct map_range {
 	unsigned page_size_mask;
 };
 
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
 #define NR_RANGE_MR 5
+#endif
 
 static int save_mr(struct map_range *mr, int nr_range,
 		   unsigned long start_pfn, unsigned long end_pfn,
-- 
cgit v0.10.2


From 0c0f756fd679d9747d52dad51fce3a5bb362eec3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:04 +0200
Subject: x86: add stub init_gbpages() for 32-bit init_memory_mapping()

Impact: cleanup

This patch adds an empty static inline init_gbpages() for the 32-bit
version of init_memory_mapping() making both versions identical.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-13-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f59e9b8..cd3c24b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -922,6 +922,10 @@ static int save_mr(struct map_range *mr, int nr_range,
 	return nr_range;
 }
 
+static inline void init_gbpages(void)
+{
+}
+
 /*
  * Setup the direct mapping of the physical memory at PAGE_OFFSET.
  * This runs before bootmem is initialized and gets pages directly from
@@ -941,6 +945,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
 
+	if (!after_bootmem)
+		init_gbpages();
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	/*
 	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-- 
cgit v0.10.2


From f765090a2617b8d9cb73b71e0aa850c29460d8be Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:05 +0200
Subject: x86: move init_memory_mapping() to common mm/init.c

Impact: cleanup

This patch moves the init_memory_mapping() function to common mm/init.c.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-14-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cc7fe66..3a21b13 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -2,10 +2,338 @@
 #include <linux/swap.h>
 
 #include <asm/cacheflush.h>
+#include <asm/e820.h>
 #include <asm/page.h>
 #include <asm/page_types.h>
 #include <asm/sections.h>
 #include <asm/system.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+extern void __init kernel_physical_mapping_init(unsigned long start_pfn,
+						unsigned long end_pfn,
+						int use_pse);
+#endif
+
+#ifdef CONFIG_X86_64
+extern unsigned long __meminit
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask);
+#endif
+
+unsigned long __initdata table_start;
+unsigned long __meminitdata table_end;
+unsigned long __meminitdata table_top;
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+				= 1
+#endif
+;
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
+{
+	unsigned long puds, pmds, ptes, tables, start;
+
+	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+	if (use_gbpages) {
+		unsigned long extra;
+
+		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+	} else
+		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+	if (use_pse) {
+		unsigned long extra;
+
+		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+		extra += PMD_SIZE;
+#endif
+		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	} else
+		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+	/* for fixmap */
+	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+	/*
+	 * RED-PEN putting page tables only on node 0 could
+	 * cause a hotspot and fill up ZONE_DMA. The page tables
+	 * need roughly 0.5KB per GB.
+	 */
+#ifdef CONFIG_X86_32
+	start = 0x7000;
+	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+					tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+	start = 0x8000;
+	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+	if (table_start == -1UL)
+		panic("Cannot find space for the kernel page tables");
+
+	table_start >>= PAGE_SHIFT;
+	table_end = table_start;
+	table_top = table_start + (tables >> PAGE_SHIFT);
+
+	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+	unsigned long start;
+	unsigned long end;
+	unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int save_mr(struct map_range *mr, int nr_range,
+		   unsigned long start_pfn, unsigned long end_pfn,
+		   unsigned long page_size_mask)
+{
+	if (start_pfn < end_pfn) {
+		if (nr_range >= NR_RANGE_MR)
+			panic("run out of range for init_memory_mapping\n");
+		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
+		mr[nr_range].page_size_mask = page_size_mask;
+		nr_range++;
+	}
+
+	return nr_range;
+}
+
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+	if (direct_gbpages && cpu_has_gbpages)
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+	else
+		direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+					       unsigned long end)
+{
+	unsigned long page_size_mask = 0;
+	unsigned long start_pfn, end_pfn;
+	unsigned long pos;
+	unsigned long ret;
+
+	struct map_range mr[NR_RANGE_MR];
+	int nr_range, i;
+	int use_pse, use_gbpages;
+
+	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+	if (!after_bootmem)
+		init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	use_pse = use_gbpages = 0;
+#else
+	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+	set_nx();
+	if (nx_enabled)
+		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+	/* Enable PSE if available */
+	if (cpu_has_pse)
+		set_in_cr4(X86_CR4_PSE);
+
+	/* Enable PGE if available */
+	if (cpu_has_pge) {
+		set_in_cr4(X86_CR4_PGE);
+		__supported_pte_mask |= _PAGE_GLOBAL;
+	}
+#endif
+
+	if (use_gbpages)
+		page_size_mask |= 1 << PG_LEVEL_1G;
+	if (use_pse)
+		page_size_mask |= 1 << PG_LEVEL_2M;
+
+	memset(mr, 0, sizeof(mr));
+	nr_range = 0;
+
+	/* head if not big page alignment ? */
+	start_pfn = start >> PAGE_SHIFT;
+	pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+	/*
+	 * Don't use a large page for the first 2/4MB of memory
+	 * because there are often fixed size MTRRs in there
+	 * and overlapping MTRRs into large pages can cause
+	 * slowdowns.
+	 */
+	if (pos == 0)
+		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+	else
+		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+				 << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+			<< (PMD_SHIFT - PAGE_SHIFT);
+#endif
+	if (end_pfn > (end >> PAGE_SHIFT))
+		end_pfn = end >> PAGE_SHIFT;
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+	/* big page (2M) range */
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+#ifdef CONFIG_X86_64
+	/* big page (1G) range */
+	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask &
+				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+	/* tail is not big page (1G) alignment */
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+#endif
+
+	/* tail is not big page (2M) alignment */
+	start_pfn = pos>>PAGE_SHIFT;
+	end_pfn = end>>PAGE_SHIFT;
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+	/* try to merge same page size and continuous */
+	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+		unsigned long old_start;
+		if (mr[i].end != mr[i+1].start ||
+		    mr[i].page_size_mask != mr[i+1].page_size_mask)
+			continue;
+		/* move it */
+		old_start = mr[i].start;
+		memmove(&mr[i], &mr[i+1],
+			(nr_range - 1 - i) * sizeof(struct map_range));
+		mr[i--].start = old_start;
+		nr_range--;
+	}
+
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+				mr[i].start, mr[i].end,
+			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+	/*
+	 * Find space for the kernel direct mapping tables.
+	 *
+	 * Later we should allocate these tables in the local node of the
+	 * memory mapped. Unfortunately this is done currently before the
+	 * nodes are discovered.
+	 */
+	if (!after_bootmem)
+		find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+	for (i = 0; i < nr_range; i++)
+		kernel_physical_mapping_init(
+				mr[i].start >> PAGE_SHIFT,
+				mr[i].end >> PAGE_SHIFT,
+				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
+	ret = end;
+#else /* CONFIG_X86_64 */
+	for (i = 0; i < nr_range; i++)
+		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+						   mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+	early_ioremap_page_table_range_init();
+
+	load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+	if (!after_bootmem)
+		mmu_cr4_features = read_cr4();
+#endif
+	__flush_tlb_all();
+
+	if (!after_bootmem && table_end > table_start)
+		reserve_early(table_start << PAGE_SHIFT,
+				 table_end << PAGE_SHIFT, "PGTABLE");
+
+	if (!after_bootmem)
+		early_memtest(start, end);
+
+	return ret >> PAGE_SHIFT;
+}
+
 
 /*
  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index cd3c24b..187522a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -59,13 +59,9 @@ unsigned long highstart_pfn, highend_pfn;
 static noinline int do_test_wp_bit(void);
 
 
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-int after_bootmem;
-
-int direct_gbpages;
+extern unsigned long __initdata table_start;
+extern unsigned long __meminitdata table_end;
+extern unsigned long __meminitdata table_top;
 
 static __init void *alloc_low_page(void)
 {
@@ -227,9 +223,9 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(unsigned long start_pfn,
-						unsigned long end_pfn,
-						int use_pse)
+void __init kernel_physical_mapping_init(unsigned long start_pfn,
+					 unsigned long end_pfn,
+					 int use_pse)
 {
 	pgd_t *pgd_base = swapper_pg_dir;
 	int pgd_idx, pmd_idx, pte_ofs;
@@ -509,7 +505,7 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init early_ioremap_page_table_range_init(void)
+void __init early_ioremap_page_table_range_init(void)
 {
 	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long vaddr, end;
@@ -834,296 +830,6 @@ void __init setup_bootmem_allocator(void)
 	after_bootmem = 1;
 }
 
-static void __init find_early_table_space(unsigned long end, int use_pse,
-					  int use_gbpages)
-{
-	unsigned long puds, pmds, ptes, tables, start;
-
-	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-
-	if (use_gbpages) {
-		unsigned long extra;
-
-		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-	} else
-		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-	if (use_pse) {
-		unsigned long extra;
-
-		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-		extra += PMD_SIZE;
-#endif
-		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	} else
-		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-#ifdef CONFIG_X86_32
-	/* for fixmap */
-	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-#ifdef CONFIG_X86_32
-	start = 0x7000;
-	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
-	start = 0x8000;
-	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
-#endif
-	if (table_start == -1UL)
-		panic("Cannot find space for the kernel page tables");
-
-	table_start >>= PAGE_SHIFT;
-	table_end = table_start;
-	table_top = table_start + (tables >> PAGE_SHIFT);
-
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-struct map_range {
-	unsigned long start;
-	unsigned long end;
-	unsigned page_size_mask;
-};
-
-#ifdef CONFIG_X86_32
-#define NR_RANGE_MR 3
-#else /* CONFIG_X86_64 */
-#define NR_RANGE_MR 5
-#endif
-
-static int save_mr(struct map_range *mr, int nr_range,
-		   unsigned long start_pfn, unsigned long end_pfn,
-		   unsigned long page_size_mask)
-{
-	if (start_pfn < end_pfn) {
-		if (nr_range >= NR_RANGE_MR)
-			panic("run out of range for init_memory_mapping\n");
-		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
-		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
-		mr[nr_range].page_size_mask = page_size_mask;
-		nr_range++;
-	}
-
-	return nr_range;
-}
-
-static inline void init_gbpages(void)
-{
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-					       unsigned long end)
-{
-	unsigned long page_size_mask = 0;
-	unsigned long start_pfn, end_pfn;
-	unsigned long pos;
-	unsigned long ret;
-
-	struct map_range mr[NR_RANGE_MR];
-	int nr_range, i;
-	int use_pse, use_gbpages;
-
-	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-	if (!after_bootmem)
-		init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-	 * This will simplify cpa(), which otherwise needs to support splitting
-	 * large pages into small in interrupt context, etc.
-	 */
-	use_pse = use_gbpages = 0;
-#else
-	use_pse = cpu_has_pse;
-	use_gbpages = direct_gbpages;
-#endif
-
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
-	set_nx();
-	if (nx_enabled)
-		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
-
-	/* Enable PSE if available */
-	if (cpu_has_pse)
-		set_in_cr4(X86_CR4_PSE);
-
-	/* Enable PGE if available */
-	if (cpu_has_pge) {
-		set_in_cr4(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	}
-#endif
-
-	if (use_gbpages)
-		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (use_pse)
-		page_size_mask |= 1 << PG_LEVEL_2M;
-
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-
-	/* head if not big page alignment ? */
-	start_pfn = start >> PAGE_SHIFT;
-	pos = start_pfn << PAGE_SHIFT;
-#ifdef CONFIG_X86_32
-	/*
-	 * Don't use a large page for the first 2/4MB of memory
-	 * because there are often fixed size MTRRs in there
-	 * and overlapping MTRRs into large pages can cause
-	 * slowdowns.
-	 */
-	if (pos == 0)
-		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
-	else
-		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-				 << (PMD_SHIFT - PAGE_SHIFT);
-#else /* CONFIG_X86_64 */
-	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-			<< (PMD_SHIFT - PAGE_SHIFT);
-#endif
-	if (end_pfn > (end >> PAGE_SHIFT))
-		end_pfn = end >> PAGE_SHIFT;
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* big page (2M) range */
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-#ifdef CONFIG_X86_32
-	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-#else /* CONFIG_X86_64 */
-	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
-#endif
-
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-#ifdef CONFIG_X86_64
-	/* big page (1G) range */
-	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask &
-				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* tail is not big page (1G) alignment */
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-#endif
-
-	/* tail is not big page (2M) alignment */
-	start_pfn = pos>>PAGE_SHIFT;
-	end_pfn = end>>PAGE_SHIFT;
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
-	/* try to merge same page size and continuous */
-	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-		unsigned long old_start;
-		if (mr[i].end != mr[i+1].start ||
-		    mr[i].page_size_mask != mr[i+1].page_size_mask)
-			continue;
-		/* move it */
-		old_start = mr[i].start;
-		memmove(&mr[i], &mr[i+1],
-			(nr_range - 1 - i) * sizeof(struct map_range));
-		mr[i--].start = old_start;
-		nr_range--;
-	}
-
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-				mr[i].start, mr[i].end,
-			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
-	if (!after_bootmem)
-		find_early_table_space(end, use_pse, use_gbpages);
-
-#ifdef CONFIG_X86_32
-	for (i = 0; i < nr_range; i++)
-		kernel_physical_mapping_init(
-				mr[i].start >> PAGE_SHIFT,
-				mr[i].end >> PAGE_SHIFT,
-				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
-	ret = end;
-#else /* CONFIG_X86_64 */
-	for (i = 0; i < nr_range; i++)
-		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-						   mr[i].page_size_mask);
-#endif
-
-#ifdef CONFIG_X86_32
-	early_ioremap_page_table_range_init();
-
-	load_cr3(swapper_pg_dir);
-#endif
-
-#ifdef CONFIG_X86_64
-	if (!after_bootmem)
-		mmu_cr4_features = read_cr4();
-#endif
-	__flush_tlb_all();
-
-	if (!after_bootmem && table_end > table_start)
-		reserve_early(table_start << PAGE_SHIFT,
-				 table_end << PAGE_SHIFT, "PGTABLE");
-
-	if (!after_bootmem)
-		early_memtest(start, end);
-
-	return ret >> PAGE_SHIFT;
-}
-
-
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d101990..a32fe07 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -61,12 +61,6 @@ static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-				= 1
-#endif
-;
-
 static int __init parse_direct_gbpages_off(char *arg)
 {
 	direct_gbpages = 0;
@@ -87,8 +81,6 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-int after_bootmem;
-
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
@@ -291,9 +283,9 @@ void __init cleanup_highmap(void)
 	}
 }
 
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
+extern unsigned long __initdata table_start;
+extern unsigned long __meminitdata table_end;
+extern unsigned long __meminitdata table_top;
 
 static __ref void *alloc_low_page(unsigned long *phys)
 {
@@ -547,77 +539,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end, int use_pse,
-					  int use_gbpages)
-{
-	unsigned long puds, pmds, ptes, tables, start;
-
-	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-
-	if (use_gbpages) {
-		unsigned long extra;
-
-		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-	} else
-		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-	if (use_pse) {
-		unsigned long extra;
-
-		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-		extra += PMD_SIZE;
-#endif
-		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	} else
-		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-#ifdef CONFIG_X86_32
-	/* for fixmap */
-	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-#ifdef CONFIG_X86_32
-	start = 0x7000;
-	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
-	start = 0x8000;
-	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
-#endif
-	if (table_start == -1UL)
-		panic("Cannot find space for the kernel page tables");
-
-	table_start >>= PAGE_SHIFT;
-	table_end = table_start;
-	table_top = table_start + (tables >> PAGE_SHIFT);
-
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init init_gbpages(void)
-{
-	if (direct_gbpages && cpu_has_gbpages)
-		printk(KERN_INFO "Using GB pages for direct mapping\n");
-	else
-		direct_gbpages = 0;
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
-						unsigned long end,
-						unsigned long page_size_mask)
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
 {
 
 	unsigned long next, last_map_addr = end;
@@ -654,231 +579,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
 	return last_map_addr;
 }
 
-struct map_range {
-	unsigned long start;
-	unsigned long end;
-	unsigned page_size_mask;
-};
-
-#ifdef CONFIG_X86_32
-#define NR_RANGE_MR 3
-#else /* CONFIG_X86_64 */
-#define NR_RANGE_MR 5
-#endif
-
-static int save_mr(struct map_range *mr, int nr_range,
-		   unsigned long start_pfn, unsigned long end_pfn,
-		   unsigned long page_size_mask)
-{
-	if (start_pfn < end_pfn) {
-		if (nr_range >= NR_RANGE_MR)
-			panic("run out of range for init_memory_mapping\n");
-		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
-		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
-		mr[nr_range].page_size_mask = page_size_mask;
-		nr_range++;
-	}
-
-	return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-					       unsigned long end)
-{
-	unsigned long page_size_mask = 0;
-	unsigned long start_pfn, end_pfn;
-	unsigned long pos;
-	unsigned long ret;
-
-	struct map_range mr[NR_RANGE_MR];
-	int nr_range, i;
-	int use_pse, use_gbpages;
-
-	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-	if (!after_bootmem)
-		init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-	 * This will simplify cpa(), which otherwise needs to support splitting
-	 * large pages into small in interrupt context, etc.
-	 */
-	use_pse = use_gbpages = 0;
-#else
-	use_pse = cpu_has_pse;
-	use_gbpages = direct_gbpages;
-#endif
-
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
-	set_nx();
-	if (nx_enabled)
-		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
-
-	/* Enable PSE if available */
-	if (cpu_has_pse)
-		set_in_cr4(X86_CR4_PSE);
-
-	/* Enable PGE if available */
-	if (cpu_has_pge) {
-		set_in_cr4(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	}
-#endif
-
-	if (use_gbpages)
-		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (use_pse)
-		page_size_mask |= 1 << PG_LEVEL_2M;
-
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-
-	/* head if not big page alignment ? */
-	start_pfn = start >> PAGE_SHIFT;
-	pos = start_pfn << PAGE_SHIFT;
-#ifdef CONFIG_X86_32
-	/*
-	 * Don't use a large page for the first 2/4MB of memory
-	 * because there are often fixed size MTRRs in there
-	 * and overlapping MTRRs into large pages can cause
-	 * slowdowns.
-	 */
-	if (pos == 0)
-		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
-	else
-		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-				 << (PMD_SHIFT - PAGE_SHIFT);
-#else /* CONFIG_X86_64 */
-	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-			<< (PMD_SHIFT - PAGE_SHIFT);
-#endif
-	if (end_pfn > (end >> PAGE_SHIFT))
-		end_pfn = end >> PAGE_SHIFT;
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* big page (2M) range */
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-#ifdef CONFIG_X86_32
-	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-#else /* CONFIG_X86_64 */
-	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
-#endif
-
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-#ifdef CONFIG_X86_64
-	/* big page (1G) range */
-	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask &
-				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* tail is not big page (1G) alignment */
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-#endif
-
-	/* tail is not big page (2M) alignment */
-	start_pfn = pos>>PAGE_SHIFT;
-	end_pfn = end>>PAGE_SHIFT;
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
-	/* try to merge same page size and continuous */
-	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-		unsigned long old_start;
-		if (mr[i].end != mr[i+1].start ||
-		    mr[i].page_size_mask != mr[i+1].page_size_mask)
-			continue;
-		/* move it */
-		old_start = mr[i].start;
-		memmove(&mr[i], &mr[i+1],
-			(nr_range - 1 - i) * sizeof(struct map_range));
-		mr[i--].start = old_start;
-		nr_range--;
-	}
-
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-				mr[i].start, mr[i].end,
-			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
-	if (!after_bootmem)
-		find_early_table_space(end, use_pse, use_gbpages);
-
-#ifdef CONFIG_X86_32
-	for (i = 0; i < nr_range; i++)
-		kernel_physical_mapping_init(
-				mr[i].start >> PAGE_SHIFT,
-				mr[i].end >> PAGE_SHIFT,
-				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
-	ret = end;
-#else /* CONFIG_X86_64 */
-	for (i = 0; i < nr_range; i++)
-		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-						   mr[i].page_size_mask);
-#endif
-
-#ifdef CONFIG_X86_32
-	early_ioremap_page_table_range_init();
-
-	load_cr3(swapper_pg_dir);
-#endif
-
-#ifdef CONFIG_X86_64
-	if (!after_bootmem)
-		mmu_cr4_features = read_cr4();
-#endif
-	__flush_tlb_all();
-
-	if (!after_bootmem && table_end > table_start)
-		reserve_early(table_start << PAGE_SHIFT,
-				 table_end << PAGE_SHIFT, "PGTABLE");
-
-	if (!after_bootmem)
-		early_memtest(start, end);
-
-	return ret >> PAGE_SHIFT;
-}
-
 #ifndef CONFIG_NUMA
 void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
-- 
cgit v0.10.2


From 298af9d89f3f5292e81a0a00f729c415adc4d8fb Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:06 +0200
Subject: x86: fix up some bad global variable names in mm/init.c

Impact: cleanup

The table_start, table_end, and table_top are too generic for global
namespace so rename them to be more specific.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-15-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 3a21b13..5bbdfe7 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -23,9 +23,9 @@ kernel_physical_mapping_init(unsigned long start,
 			     unsigned long page_size_mask);
 #endif
 
-unsigned long __initdata table_start;
-unsigned long __meminitdata table_end;
-unsigned long __meminitdata table_top;
+unsigned long __initdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
 
 int after_bootmem;
 
@@ -78,21 +78,21 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 	 */
 #ifdef CONFIG_X86_32
 	start = 0x7000;
-	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
 					tables, PAGE_SIZE);
 #else /* CONFIG_X86_64 */
 	start = 0x8000;
-	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+	e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
 #endif
-	if (table_start == -1UL)
+	if (e820_table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
-	table_start >>= PAGE_SHIFT;
-	table_end = table_start;
-	table_top = table_start + (tables >> PAGE_SHIFT);
+	e820_table_start >>= PAGE_SHIFT;
+	e820_table_end = e820_table_start;
+	e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
 
 	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
+		end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
 }
 
 struct map_range {
@@ -324,9 +324,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 #endif
 	__flush_tlb_all();
 
-	if (!after_bootmem && table_end > table_start)
-		reserve_early(table_start << PAGE_SHIFT,
-				 table_end << PAGE_SHIFT, "PGTABLE");
+	if (!after_bootmem && e820_table_end > e820_table_start)
+		reserve_early(e820_table_start << PAGE_SHIFT,
+				 e820_table_end << PAGE_SHIFT, "PGTABLE");
 
 	if (!after_bootmem)
 		early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 187522a..e9df0d9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -59,16 +59,16 @@ unsigned long highstart_pfn, highend_pfn;
 static noinline int do_test_wp_bit(void);
 
 
-extern unsigned long __initdata table_start;
-extern unsigned long __meminitdata table_end;
-extern unsigned long __meminitdata table_top;
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = table_end++;
+	unsigned long pfn = e820_table_end++;
 	void *adr;
 
-	if (pfn >= table_top)
+	if (pfn >= e820_table_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = __va(pfn * PAGE_SIZE);
@@ -149,8 +149,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
 	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
 	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-	    && ((__pa(pte) >> PAGE_SHIFT) < table_start
-		|| (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+	    && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
 		pte_t *newpte;
 		int i;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a32fe07..a1d33c5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -283,13 +283,13 @@ void __init cleanup_highmap(void)
 	}
 }
 
-extern unsigned long __initdata table_start;
-extern unsigned long __meminitdata table_end;
-extern unsigned long __meminitdata table_top;
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
 
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = table_end++;
+	unsigned long pfn = e820_table_end++;
 	void *adr;
 
 	if (after_bootmem) {
@@ -299,7 +299,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= table_top)
+	if (pfn >= e820_table_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
-- 
cgit v0.10.2


From e53fb04fce6d246ebed755b904ed1b0b814a754c Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:07 +0200
Subject: x86: unify kernel_physical_mapping_init() function signatures

Impact: cleanup

In preparation for moving the function declaration to a header file,
unify 32-bit and 64-bit signatures.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-16-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 5bbdfe7..6475693 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -11,17 +11,12 @@
 
 #ifdef CONFIG_X86_32
 extern void __init early_ioremap_page_table_range_init(void);
-extern void __init kernel_physical_mapping_init(unsigned long start_pfn,
-						unsigned long end_pfn,
-						int use_pse);
 #endif
 
-#ifdef CONFIG_X86_64
-extern unsigned long __meminit
+extern unsigned long __init
 kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
 			     unsigned long page_size_mask);
-#endif
 
 unsigned long __initdata e820_table_start;
 unsigned long __meminitdata e820_table_end;
@@ -301,10 +296,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 #ifdef CONFIG_X86_32
 	for (i = 0; i < nr_range; i++)
-		kernel_physical_mapping_init(
-				mr[i].start >> PAGE_SHIFT,
-				mr[i].end >> PAGE_SHIFT,
-				mr[i].page_size_mask == (1<<PG_LEVEL_2M));
+		kernel_physical_mapping_init(mr[i].start, mr[i].end,
+					     mr[i].page_size_mask);
 	ret = end;
 #else /* CONFIG_X86_64 */
 	for (i = 0; i < nr_range; i++)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e9df0d9..5ca9c6c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -223,10 +223,13 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-void __init kernel_physical_mapping_init(unsigned long start_pfn,
-					 unsigned long end_pfn,
-					 int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
 {
+	int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+	unsigned long start_pfn, end_pfn;
 	pgd_t *pgd_base = swapper_pg_dir;
 	int pgd_idx, pmd_idx, pte_ofs;
 	unsigned long pfn;
@@ -236,6 +239,9 @@ void __init kernel_physical_mapping_init(unsigned long start_pfn,
 	unsigned pages_2m, pages_4k;
 	int mapping_iter;
 
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = end >> PAGE_SHIFT;
+
 	/*
 	 * First iteration will setup identity mapping using large/small pages
 	 * based on use_pse, with other attributes same as set by
@@ -350,6 +356,7 @@ repeat:
 		mapping_iter = 2;
 		goto repeat;
 	}
+	return 0;
 }
 
 pte_t *kmap_pte;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a1d33c5..f441ae3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -539,7 +539,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-unsigned long __meminit
+unsigned long __init
 kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
 			     unsigned long page_size_mask)
-- 
cgit v0.10.2


From 4fcb208391be5cf82c6fe2779c5eb9245ac97e91 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 14:55:08 +0200
Subject: x86: move function and variable declarations to asm/init.h

Impact: cleanup

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-17-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 0000000..36fb1a6a
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_INIT_32_H
+#define _ASM_X86_INIT_32_H
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+#endif
+
+extern unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask);
+
+
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
+
+#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6475693..6d63e3d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,21 +3,13 @@
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
+#include <asm/init.h>
 #include <asm/page.h>
 #include <asm/page_types.h>
 #include <asm/sections.h>
 #include <asm/system.h>
 #include <asm/tlbflush.h>
 
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
-
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
-			     unsigned long end,
-			     unsigned long page_size_mask);
-
 unsigned long __initdata e820_table_start;
 unsigned long __meminitdata e820_table_end;
 unsigned long __meminitdata e820_table_top;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5ca9c6c..1669693 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/init.h>
 
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
@@ -58,11 +59,6 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
-
-extern unsigned long __initdata e820_table_start;
-extern unsigned long __meminitdata e820_table_end;
-extern unsigned long __meminitdata e820_table_top;
-
 static __init void *alloc_low_page(void)
 {
 	unsigned long pfn = e820_table_end++;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f441ae3..7dd7ce4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
 #include <asm/kdebug.h>
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
+#include <asm/init.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -283,10 +284,6 @@ void __init cleanup_highmap(void)
 	}
 }
 
-extern unsigned long __initdata e820_table_start;
-extern unsigned long __meminitdata e820_table_end;
-extern unsigned long __meminitdata e820_table_top;
-
 static __ref void *alloc_low_page(unsigned long *phys)
 {
 	unsigned long pfn = e820_table_end++;
-- 
cgit v0.10.2


From 62436fe9ee10f5e0dd087b106d69d93c9549935a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 14:39:03 +0100
Subject: x86: move init_memory_mapping() to common mm/init.c, build fix on
 32-bit PAE

Impact: build fix

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236257708-27269-14-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad..b8238dc 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
 extern int nx_enabled;
+extern void set_nx(void);
 
 #define pgprot_writecombine	pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 1669693..5e5126e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -605,7 +605,7 @@ static int __init noexec_setup(char *str)
 }
 early_param("noexec", noexec_setup);
 
-static void __init set_nx(void)
+void __init set_nx(void)
 {
 	unsigned int v[4], l, h;
 
-- 
cgit v0.10.2


From a964e33c5d7c0ea46376d20c2f02edf01c9db251 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 17:14:30 -0800
Subject: x86: clean up old gcc warnings

gcc 3.2.2 reports:

In file included from /usr/src/all/linux-next/arch/x86/include/asm/page.h:8,
                 from /usr/src/all/linux-next/arch/x86/include/asm/processor.h:18,
                 from /usr/src/all/linux-next/arch/x86/include/asm/atomic_32.h:6,
                 from /usr/src/all/linux-next/arch/x86/include/asm/atomic.h:2,
                 from include/linux/crypto.h:20,
                 from arch/x86/kernel/asm-offsets_32.c:7,
                 from arch/x86/kernel/asm-offsets.c:2:
/usr/src/all/linux-next/arch/x86/include/asm/page_types.h:54: warning: parameter has incomplete type
/usr/src/all/linux-next/arch/x86/include/asm/page_types.h:56: warning: parameter has incomplete type
In file included from /usr/src/all/linux-next/arch/x86/include/asm/page.h:8,
                 from /usr/src/all/linux-next/arch/x86/include/asm/processor.h:18,
                 from include/linux/prefetch.h:14,
                 from include/linux/list.h:6,
                 from include/linux/module.h:9,
                 from init/main.c:13:
/usr/src/all/linux-next/arch/x86/include/asm/page_types.h:54: warning: parameter has incomplete type
/usr/src/all/linux-next/arch/x86/include/asm/page_types.h:56: warning: parameter has incomplete type

This is a bogus warning, but moving the pat-related functions
into asm/pat.h and including asm/pgtable_types.h should fix it.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da..826ad37 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
 
 #ifndef __ASSEMBLY__
 
-struct pgprot;
-
 extern int page_is_ram(unsigned long pagenr);
 extern int devmem_is_allowed(unsigned long pagenr);
-extern void map_devmem(unsigned long pfn, unsigned long size,
-		       struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
-			 struct pgprot vma_prot);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e7005..2cd07b9 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PAT_H
 
 #include <linux/types.h>
+#include <asm/pgtable_types.h>
 
 #ifdef CONFIG_X86_PAT
 extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
 
 extern int kernel_map_sync_memtype(u64 base, unsigned long size,
 		unsigned long flag);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+		       struct pgprot vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+			 struct pgprot vma_prot);
 
 #endif /* _ASM_X86_PAT_H */
-- 
cgit v0.10.2


From dc16ecf7fd1fad7436832121435d4926a81d469e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 16:10:44 -0800
Subject: x86-32: use specific __vmalloc_start_set flag in __virt_addr_valid

Rather than relying on the ever-unreliable system_state,
add a specific __vmalloc_start_set flag to indicate whether
the vmalloc area has meaningful boundaries yet, and use that
in x86-32's __phys_addr and __virt_addr_valid.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b..2733fad 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
  * area for the same reason. ;)
  */
 #define VMALLOC_OFFSET	(8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
 #define VMALLOC_START	((unsigned long)high_memory + VMALLOC_OFFSET)
 #ifdef CONFIG_X86_PAE
 #define LAST_PKMAP 512
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5e5126e..d57dfff 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -59,6 +59,8 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
+bool __read_mostly __vmalloc_start_set = false;
+
 static __init void *alloc_low_page(void)
 {
 	unsigned long pfn = e820_table_end++;
@@ -757,6 +759,8 @@ void __init initmem_init(unsigned long start_pfn,
 #ifdef CONFIG_FLATMEM
 	max_mapnr = num_physpages;
 #endif
+	__vmalloc_start_set = true;
+
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(max_low_pfn));
 
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd..a23ca5b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -76,10 +76,9 @@ static inline int phys_addr_valid(unsigned long addr)
 #ifdef CONFIG_DEBUG_VIRTUAL
 unsigned long __phys_addr(unsigned long x)
 {
-	/* VMALLOC_* aren't constants; not available at the boot time */
+	/* VMALLOC_* aren't constants  */
 	VIRTUAL_BUG_ON(x < PAGE_OFFSET);
-	VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
-		is_vmalloc_addr((void *) x));
+	VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
 	return x - PAGE_OFFSET;
 }
 EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +88,7 @@ bool __virt_addr_valid(unsigned long x)
 {
 	if (x < PAGE_OFFSET)
 		return false;
-	if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+	if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
 		return false;
 	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
 }
-- 
cgit v0.10.2


From ed26dbe5ae045e5bf95c6dc27497397a3fde52e1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 16:16:51 -0800
Subject: x86: pre-initialize boot_cpu_data.x86_phys_bits to avoid system_state
 tests

Impact: cleanup, micro-optimization

Pre-initialize boot_cpu_data.x86_phys_bits to a reasonable default
to remove the use of system_state tests in __virt_addr_valid()
and __phys_addr().

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b746deb..f28c56e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,7 +202,9 @@ struct ist_info ist_info;
 #endif
 
 #else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+	.x86_phys_bits = MAX_PHYSMEM_BITS,
+};
 EXPORT_SYMBOL(boot_cpu_data);
 #endif
 
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index a23ca5b..62773ab 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x)
 	} else {
 		VIRTUAL_BUG_ON(x < PAGE_OFFSET);
 		x -= PAGE_OFFSET;
-		VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
-					!phys_addr_valid(x));
+		VIRTUAL_BUG_ON(!phys_addr_valid(x));
 	}
 	return x;
 }
@@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x)
 		if (x < PAGE_OFFSET)
 			return false;
 		x -= PAGE_OFFSET;
-		if (system_state == SYSTEM_BOOTING ?
-				x > MAXMEM : !phys_addr_valid(x)) {
+		if (!phys_addr_valid(x))
 			return false;
-		}
 	}
 
 	return pfn_valid(x >> PAGE_SHIFT);
-- 
cgit v0.10.2


From b02c387892fc6b3cc59c78ab2f79413d55f50190 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 12 Feb 2009 13:42:41 +0300
Subject: [WATCHDOG] ks8695_wdt.c: 'CLOCK_TICK_RATE' undeclared

On arm-acs5k_tiny:

drivers/watchdog/ks8695_wdt.c:68: error: 'CLOCK_TICK_RATE' undeclared
	(first use in this function)

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Cc: stable <stable@kernel.org>

diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c
index 0b798fd..74c92d3 100644
--- a/drivers/watchdog/ks8695_wdt.c
+++ b/drivers/watchdog/ks8695_wdt.c
@@ -21,6 +21,7 @@
 #include <linux/watchdog.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
+#include <mach/timex.h>
 #include <mach/regs-timer.h>
 
 #define WDT_DEFAULT_TIME	5	/* seconds */
-- 
cgit v0.10.2


From 26952669f01646c3e7d0832c99b310b199fe2b20 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 3 Mar 2009 15:10:05 +0100
Subject: [WATCHDOG] gef_wdt.c: fsl_get_sys_freq() failure not noticed

fsl_get_sys_freq() may return -1 when 'soc' isn't found, but in
gef_wdt_probe() 'freq' is unsigned, so the test doesn't catch that.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

diff --git a/drivers/watchdog/gef_wdt.c b/drivers/watchdog/gef_wdt.c
index f0c2b7a..734d980 100644
--- a/drivers/watchdog/gef_wdt.c
+++ b/drivers/watchdog/gef_wdt.c
@@ -269,7 +269,7 @@ static int __devinit gef_wdt_probe(struct of_device *dev,
 	bus_clk = 133; /* in MHz */
 
 	freq = fsl_get_sys_freq();
-	if (freq > 0)
+	if (freq != -1)
 		bus_clk = freq;
 
 	/* Map devices registers into memory */
-- 
cgit v0.10.2


From b2b352590d94651579e6914ecdb08d30b2cb5f19 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 15:15:44 +0100
Subject: x86: UV, SGI RTC: add generic system vector, build fix on UP

Make ack_APIC_irq() build on !SMP && !APIC too.

Cc: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20090304185605.GA24419@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4ef949c..394d177 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -379,6 +379,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
 
 static inline void ack_APIC_irq(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * ack_APIC_irq() actually gets compiled as a single instruction
 	 * ... yummie.
@@ -386,6 +387,7 @@ static inline void ack_APIC_irq(void)
 
 	/* Docs say use 0 for future compatibility */
 	apic_write(APIC_EOI, 0);
+#endif
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
-- 
cgit v0.10.2


From e0c6dcd8d4257a129fc813ac68f20b77c6ae2a7a Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 5 Mar 2009 16:10:55 +0100
Subject: ide: expiry() returns int, negative expiry() return values won't be
 noticed

bart:
It seems like the bug could cause insanely long timeouts for:
- ATA_DMA_ERR error in dma_timer_expiry()
- commands without ->expiry in tc86c001_timer_expiry()
  (TC86C001 IDE controller only)

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
[bart: port it to the current tree]
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 9ee51ad..9ff90cb 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -908,7 +908,7 @@ void ide_timer_expiry (unsigned long data)
 	ide_drive_t	*uninitialized_var(drive);
 	ide_handler_t	*handler;
 	unsigned long	flags;
-	unsigned long	wait = -1;
+	int		wait = -1;
 	int		plug_device = 0;
 
 	spin_lock_irqsave(&hwif->lock, flags);
-- 
cgit v0.10.2


From 71bfc7a7c73eaf1f99e309dba60822ba050e3ec5 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Thu, 5 Mar 2009 16:10:56 +0100
Subject: ide: NULL noise: drivers/ide/ide-*.c

Fix this sparse warnings:
  drivers/ide/ide-disk_proc.c:130:11: warning: Using plain integer as NULL pointer
  drivers/ide/ide-floppy_proc.c:32:11: warning: Using plain integer as NULL pointer
  drivers/ide/ide-proc.c:234:11: warning: Using plain integer as NULL pointer
  drivers/ide/ide-tape.c:2141:11: warning: Using plain integer as NULL pointer

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Cc: trivial@kernel.org
Cc: kernel-janitors@vger.kernel.org
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c
index 1146f42..1f86dcb 100644
--- a/drivers/ide/ide-disk_proc.c
+++ b/drivers/ide/ide-disk_proc.c
@@ -125,5 +125,5 @@ const struct ide_proc_devset ide_disk_settings[] = {
 	IDE_PROC_DEVSET(multcount,	0,    16),
 	IDE_PROC_DEVSET(nowerr,		0,     1),
 	IDE_PROC_DEVSET(wcache,		0,     1),
-	{ 0 },
+	{ NULL },
 };
diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c
index 3ec762c..fcd4d81 100644
--- a/drivers/ide/ide-floppy_proc.c
+++ b/drivers/ide/ide-floppy_proc.c
@@ -29,5 +29,5 @@ const struct ide_proc_devset ide_floppy_settings[] = {
 	IDE_PROC_DEVSET(bios_head, 0,  255),
 	IDE_PROC_DEVSET(bios_sect, 0,   63),
 	IDE_PROC_DEVSET(ticks,	   0,  255),
-	{ 0 },
+	{ NULL },
 };
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index 1d8978b..a7b9287 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -231,7 +231,7 @@ static const struct ide_proc_devset ide_generic_settings[] = {
 	IDE_PROC_DEVSET(pio_mode, 0, 255),
 	IDE_PROC_DEVSET(unmaskirq, 0, 1),
 	IDE_PROC_DEVSET(using_dma, 0, 1),
-	{ 0 },
+	{ NULL },
 };
 
 static void proc_ide_settings_warn(void)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index bb450a7..4e6181c 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -2166,7 +2166,7 @@ static const struct ide_proc_devset idetape_settings[] = {
 	__IDE_PROC_DEVSET(speed,	0, 0xffff, NULL, NULL),
 	__IDE_PROC_DEVSET(tdsc,		IDETAPE_DSC_RW_MIN, IDETAPE_DSC_RW_MAX,
 					mulf_tdsc, divf_tdsc),
-	{ 0 },
+	{ NULL },
 };
 #endif
 
-- 
cgit v0.10.2


From a509538d4fb4f99cdf0a095213d57cc3b2347615 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Thu, 5 Mar 2009 16:10:56 +0100
Subject: ide-iops: fix odd-length ATAPI PIO transfers

Commit 9567b349f7e7dd7e2483db99ee8e4a6fe0caca38 (ide: merge ->atapi_*put_bytes
and ->ata_*put_data methods) introduced a regression  WRT the odd-length ATAPI
PIO transfers -- the final word didn't get written (causing command timeouts).

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 753b92e..b1892bd 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -315,6 +315,8 @@ void ide_output_data(ide_drive_t *drive, struct request *rq, void *buf,
 	u8 io_32bit = drive->io_32bit;
 	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
 
+	len++;
+
 	if (io_32bit) {
 		unsigned long uninitialized_var(flags);
 
-- 
cgit v0.10.2


From 849d7130001ab740a5a4778a561049841fdd77c9 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <stf_xl@wp.pl>
Date: Thu, 5 Mar 2009 16:10:57 +0100
Subject: ide: allow to wrap interrupt handler

Signed-off-by: Stanislaw Gruszka <stf_xl@wp.pl>
Cc: Andrew Victor <linux@maxim.org.za>
[bart: minor checkpatch.pl / CodingStyle fixups]
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 9ff90cb..a9a6c20 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -1162,6 +1162,7 @@ out_early:
 
 	return irq_ret;
 }
+EXPORT_SYMBOL_GPL(ide_intr);
 
 /**
  *	ide_do_drive_cmd	-	issue IDE special command
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index ce0818a..ee8e3e7 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -950,6 +950,7 @@ static int ide_port_setup_devices(ide_hwif_t *hwif)
 static int init_irq (ide_hwif_t *hwif)
 {
 	struct ide_io_ports *io_ports = &hwif->io_ports;
+	irq_handler_t irq_handler;
 	int sa = 0;
 
 	mutex_lock(&ide_cfg_mtx);
@@ -959,6 +960,10 @@ static int init_irq (ide_hwif_t *hwif)
 	hwif->timer.function = &ide_timer_expiry;
 	hwif->timer.data = (unsigned long)hwif;
 
+	irq_handler = hwif->host->irq_handler;
+	if (irq_handler == NULL)
+		irq_handler = ide_intr;
+
 #if defined(__mc68000__)
 	sa = IRQF_SHARED;
 #endif /* __mc68000__ */
@@ -969,7 +974,7 @@ static int init_irq (ide_hwif_t *hwif)
 	if (io_ports->ctl_addr)
 		hwif->tp_ops->set_irq(hwif, 1);
 
-	if (request_irq(hwif->irq, &ide_intr, sa, hwif->name, hwif))
+	if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif))
 		goto out_up;
 
 	if (!hwif->rqsize) {
diff --git a/include/linux/ide.h b/include/linux/ide.h
index fe235b6..e0cedfe 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -866,6 +866,7 @@ struct ide_host {
 	unsigned int	n_ports;
 	struct device	*dev[2];
 	unsigned int	(*init_chipset)(struct pci_dev *);
+	irq_handler_t	irq_handler;
 	unsigned long	host_flags;
 	void		*host_priv;
 	ide_hwif_t	*cur_port;	/* for hosts requiring serialization */
-- 
cgit v0.10.2


From 6e5f1e1115bb041993f9f247036996364b4c84d5 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <stf_xl@wp.pl>
Date: Thu, 5 Mar 2009 16:10:58 +0100
Subject: ide: add at91_ide driver

This is IDE host driver for AT91 (SAM9, CAP9, AT572D940HF) Static Memory
Controller with Compact Flash True IDE Mode logic.

Driver have to switch 8/16 bit bus width when accessing Task Tile or Data
Register. Moreover some extra things need to be done when setting PIO mode.
Only PIO mode is used, hardware have no DMA support. If interrupt line is
connected through GPIO extra quirk is needed to cope with fake interrupts.

Signed-off-by: Stanislaw Gruszka <stf_xl@wp.pl>
Cc: Andrew Victor <avictor.za@gmail.com>
Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/arch/arm/mach-at91/include/mach/board.h b/arch/arm/mach-at91/include/mach/board.h
index 0b3ae21..793fe7b 100644
--- a/arch/arm/mach-at91/include/mach/board.h
+++ b/arch/arm/mach-at91/include/mach/board.h
@@ -56,6 +56,9 @@ struct at91_cf_data {
 	u8	vcc_pin;		/* power switching */
 	u8	rst_pin;		/* card reset */
 	u8	chipselect;		/* EBI Chip Select number */
+	u8	flags;
+#define AT91_CF_TRUE_IDE	0x01
+#define AT91_IDE_SWAP_A0_A2	0x02
 };
 extern void __init at91_add_device_cf(struct at91_cf_data *data);
 
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index e072903..5ea3bfa 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -721,6 +721,11 @@ config BLK_DEV_IDE_TX4939
 	depends on SOC_TX4939
 	select BLK_DEV_IDEDMA_SFF
 
+config BLK_DEV_IDE_AT91
+	tristate "Atmel AT91 (SAM9, CAP9, AT572D940HF) IDE support"
+	depends on ARM && ARCH_AT91 && !ARCH_AT91RM9200 && !ARCH_AT91X40
+	select IDE_TIMINGS
+
 config IDE_ARM
 	tristate "ARM IDE support"
 	depends on ARM && (ARCH_RPC || ARCH_SHARK)
diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index d0e3d7d..1c326d9 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -116,3 +116,4 @@ obj-$(CONFIG_BLK_DEV_IDE_AU1XXX)	+= au1xxx-ide.o
 
 obj-$(CONFIG_BLK_DEV_IDE_TX4938)	+= tx4938ide.o
 obj-$(CONFIG_BLK_DEV_IDE_TX4939)	+= tx4939ide.o
+obj-$(CONFIG_BLK_DEV_IDE_AT91)		+= at91_ide.o
diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
new file mode 100644
index 0000000..1bb50f4
--- /dev/null
+++ b/drivers/ide/at91_ide.c
@@ -0,0 +1,467 @@
+/*
+ * IDE host driver for AT91 (SAM9, CAP9, AT572D940HF) Static Memory Controller
+ * with Compact Flash True IDE logic
+ *
+ * Copyright (c) 2008, 2009 Kelvatek Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/ide.h>
+#include <linux/platform_device.h>
+
+#include <mach/board.h>
+#include <mach/gpio.h>
+#include <mach/at91sam9263.h>
+#include <mach/at91sam9_smc.h>
+#include <mach/at91sam9263_matrix.h>
+
+#define DRV_NAME "at91_ide"
+
+#define perr(fmt, args...) pr_err(DRV_NAME ": " fmt, ##args)
+#define pdbg(fmt, args...) pr_debug("%s " fmt, __func__, ##args)
+
+/*
+ * Access to IDE device is possible through EBI Static Memory Controller
+ * with Compact Flash logic. For details see EBI and SMC datasheet sections
+ * of any microcontroller from AT91SAM9 family.
+ *
+ * Within SMC chip select address space, lines A[23:21] distinguish Compact
+ * Flash modes (I/O, common memory, attribute memory, True IDE). IDE modes are:
+ *   0x00c0000 - True IDE
+ *   0x00e0000 - Alternate True IDE (Alt Status Register)
+ *
+ * On True IDE mode Task File and Data Register are mapped at the same address.
+ * To distinguish access between these two different bus data width is used:
+ * 8Bit for Task File, 16Bit for Data I/O.
+ *
+ * After initialization we do 8/16 bit flipping (changes in SMC MODE register)
+ * only inside IDE callback routines which are serialized by IDE layer,
+ * so no additional locking needed.
+ */
+
+#define TASK_FILE	0x00c00000
+#define ALT_MODE	0x00e00000
+#define REGS_SIZE	8
+
+#define enter_16bit(cs, mode) do {					\
+	mode = at91_sys_read(AT91_SMC_MODE(cs));			\
+	at91_sys_write(AT91_SMC_MODE(cs), mode | AT91_SMC_DBW_16);	\
+} while (0)
+
+#define leave_16bit(cs, mode) at91_sys_write(AT91_SMC_MODE(cs), mode);
+
+static void set_smc_timings(const u8 chipselect, const u16 cycle,
+			    const u16 setup, const u16 pulse,
+			    const u16 data_float, int use_iordy)
+{
+	unsigned long mode = AT91_SMC_READMODE | AT91_SMC_WRITEMODE |
+			     AT91_SMC_BAT_SELECT;
+
+	/* disable or enable waiting for IORDY signal */
+	if (use_iordy)
+		mode |= AT91_SMC_EXNWMODE_READY;
+
+	/* add data float cycles if needed */
+	if (data_float)
+		mode |= AT91_SMC_TDF_(data_float);
+
+	at91_sys_write(AT91_SMC_MODE(chipselect), mode);
+
+	/* setup timings in SMC */
+	at91_sys_write(AT91_SMC_SETUP(chipselect), AT91_SMC_NWESETUP_(setup) |
+						   AT91_SMC_NCS_WRSETUP_(0) |
+						   AT91_SMC_NRDSETUP_(setup) |
+						   AT91_SMC_NCS_RDSETUP_(0));
+	at91_sys_write(AT91_SMC_PULSE(chipselect), AT91_SMC_NWEPULSE_(pulse) |
+						   AT91_SMC_NCS_WRPULSE_(cycle) |
+						   AT91_SMC_NRDPULSE_(pulse) |
+						   AT91_SMC_NCS_RDPULSE_(cycle));
+	at91_sys_write(AT91_SMC_CYCLE(chipselect), AT91_SMC_NWECYCLE_(cycle) |
+						   AT91_SMC_NRDCYCLE_(cycle));
+}
+
+static unsigned int calc_mck_cycles(unsigned int ns, unsigned int mck_hz)
+{
+	u64 tmp = ns;
+
+	tmp *= mck_hz;
+	tmp += 1000*1000*1000 - 1; /* round up */
+	do_div(tmp, 1000*1000*1000);
+	return (unsigned int) tmp;
+}
+
+static void apply_timings(const u8 chipselect, const u8 pio,
+			  const struct ide_timing *timing, int use_iordy)
+{
+	unsigned int t0, t1, t2, t6z;
+	unsigned int cycle, setup, pulse, data_float;
+	unsigned int mck_hz;
+	struct clk *mck;
+
+	/* see table 22 of Compact Flash standard 4.1 for the meaning,
+	 * we do not stretch active (t2) time, so setup (t1) + hold time (th)
+	 * assure at least minimal recovery (t2i) time */
+	t0 = timing->cyc8b;
+	t1 = timing->setup;
+	t2 = timing->act8b;
+	t6z = (pio < 5) ? 30 : 20;
+
+	pdbg("t0=%u t1=%u t2=%u t6z=%u\n", t0, t1, t2, t6z);
+
+	mck = clk_get(NULL, "mck");
+	BUG_ON(IS_ERR(mck));
+	mck_hz = clk_get_rate(mck);
+	pdbg("mck_hz=%u\n", mck_hz);
+
+	cycle = calc_mck_cycles(t0, mck_hz);
+	setup = calc_mck_cycles(t1, mck_hz);
+	pulse = calc_mck_cycles(t2, mck_hz);
+	data_float = calc_mck_cycles(t6z, mck_hz);
+
+	pdbg("cycle=%u setup=%u pulse=%u data_float=%u\n",
+	     cycle, setup, pulse, data_float);
+
+	set_smc_timings(chipselect, cycle, setup, pulse, data_float, use_iordy);
+}
+
+static void at91_ide_input_data(ide_drive_t *drive, struct request *rq,
+				void *buf, unsigned int len)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	struct ide_io_ports *io_ports = &hwif->io_ports;
+	u8 chipselect = hwif->select_data;
+	unsigned long mode;
+
+	pdbg("cs %u buf %p len %d\n", chipselect, buf, len);
+
+	len++;
+
+	enter_16bit(chipselect, mode);
+	__ide_mm_insw((void __iomem *) io_ports->data_addr, buf, len / 2);
+	leave_16bit(chipselect, mode);
+}
+
+static void at91_ide_output_data(ide_drive_t *drive, struct request *rq,
+				 void *buf, unsigned int len)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	struct ide_io_ports *io_ports = &hwif->io_ports;
+	u8 chipselect = hwif->select_data;
+	unsigned long mode;
+
+	pdbg("cs %u buf %p len %d\n", chipselect,  buf, len);
+
+	enter_16bit(chipselect, mode);
+	__ide_mm_outsw((void __iomem *) io_ports->data_addr, buf, len / 2);
+	leave_16bit(chipselect, mode);
+}
+
+static u8 ide_mm_inb(unsigned long port)
+{
+	return readb((void __iomem *) port);
+}
+
+static void ide_mm_outb(u8 value, unsigned long port)
+{
+	writeb(value, (void __iomem *) port);
+}
+
+static void at91_ide_tf_load(ide_drive_t *drive, ide_task_t *task)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	struct ide_io_ports *io_ports = &hwif->io_ports;
+	struct ide_taskfile *tf = &task->tf;
+	u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF;
+
+	if (task->tf_flags & IDE_TFLAG_FLAGGED)
+		HIHI = 0xFF;
+
+	if (task->tf_flags & IDE_TFLAG_OUT_DATA) {
+		u16 data = (tf->hob_data << 8) | tf->data;
+
+		at91_ide_output_data(drive, NULL, &data, 2);
+	}
+
+	if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
+		ide_mm_outb(tf->hob_feature, io_ports->feature_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT)
+		ide_mm_outb(tf->hob_nsect, io_ports->nsect_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL)
+		ide_mm_outb(tf->hob_lbal, io_ports->lbal_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM)
+		ide_mm_outb(tf->hob_lbam, io_ports->lbam_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH)
+		ide_mm_outb(tf->hob_lbah, io_ports->lbah_addr);
+
+	if (task->tf_flags & IDE_TFLAG_OUT_FEATURE)
+		ide_mm_outb(tf->feature, io_ports->feature_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_NSECT)
+		ide_mm_outb(tf->nsect, io_ports->nsect_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_LBAL)
+		ide_mm_outb(tf->lbal, io_ports->lbal_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_LBAM)
+		ide_mm_outb(tf->lbam, io_ports->lbam_addr);
+	if (task->tf_flags & IDE_TFLAG_OUT_LBAH)
+		ide_mm_outb(tf->lbah, io_ports->lbah_addr);
+
+	if (task->tf_flags & IDE_TFLAG_OUT_DEVICE)
+		ide_mm_outb((tf->device & HIHI) | drive->select, io_ports->device_addr);
+}
+
+static void at91_ide_tf_read(ide_drive_t *drive, ide_task_t *task)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	struct ide_io_ports *io_ports = &hwif->io_ports;
+	struct ide_taskfile *tf = &task->tf;
+
+	if (task->tf_flags & IDE_TFLAG_IN_DATA) {
+		u16 data;
+
+		at91_ide_input_data(drive, NULL, &data, 2);
+		tf->data = data & 0xff;
+		tf->hob_data = (data >> 8) & 0xff;
+	}
+
+	/* be sure we're looking at the low order bits */
+	ide_mm_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+
+	if (task->tf_flags & IDE_TFLAG_IN_FEATURE)
+		tf->feature = ide_mm_inb(io_ports->feature_addr);
+	if (task->tf_flags & IDE_TFLAG_IN_NSECT)
+		tf->nsect  = ide_mm_inb(io_ports->nsect_addr);
+	if (task->tf_flags & IDE_TFLAG_IN_LBAL)
+		tf->lbal   = ide_mm_inb(io_ports->lbal_addr);
+	if (task->tf_flags & IDE_TFLAG_IN_LBAM)
+		tf->lbam   = ide_mm_inb(io_ports->lbam_addr);
+	if (task->tf_flags & IDE_TFLAG_IN_LBAH)
+		tf->lbah   = ide_mm_inb(io_ports->lbah_addr);
+	if (task->tf_flags & IDE_TFLAG_IN_DEVICE)
+		tf->device = ide_mm_inb(io_ports->device_addr);
+
+	if (task->tf_flags & IDE_TFLAG_LBA48) {
+		ide_mm_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+
+		if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
+			tf->hob_feature = ide_mm_inb(io_ports->feature_addr);
+		if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
+			tf->hob_nsect   = ide_mm_inb(io_ports->nsect_addr);
+		if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
+			tf->hob_lbal    = ide_mm_inb(io_ports->lbal_addr);
+		if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
+			tf->hob_lbam    = ide_mm_inb(io_ports->lbam_addr);
+		if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
+			tf->hob_lbah    = ide_mm_inb(io_ports->lbah_addr);
+	}
+}
+
+static void at91_ide_set_pio_mode(ide_drive_t *drive, const u8 pio)
+{
+	struct ide_timing *timing;
+	u8 chipselect = drive->hwif->select_data;
+	int use_iordy = 0;
+
+	pdbg("chipselect %u pio %u\n", chipselect, pio);
+
+	timing = ide_timing_find_mode(XFER_PIO_0 + pio);
+	BUG_ON(!timing);
+
+	if ((pio > 2 || ata_id_has_iordy(drive->id)) &&
+	    !(ata_id_is_cfa(drive->id) && pio > 4))
+		use_iordy = 1;
+
+	apply_timings(chipselect, pio, timing, use_iordy);
+}
+
+static const struct ide_tp_ops at91_ide_tp_ops = {
+	.exec_command	= ide_exec_command,
+	.read_status	= ide_read_status,
+	.read_altstatus	= ide_read_altstatus,
+	.set_irq	= ide_set_irq,
+
+	.tf_load	= at91_ide_tf_load,
+	.tf_read	= at91_ide_tf_read,
+
+	.input_data	= at91_ide_input_data,
+	.output_data	= at91_ide_output_data,
+};
+
+static const struct ide_port_ops at91_ide_port_ops = {
+	.set_pio_mode	= at91_ide_set_pio_mode,
+};
+
+static const struct ide_port_info at91_ide_port_info __initdata = {
+	.port_ops	= &at91_ide_port_ops,
+	.tp_ops		= &at91_ide_tp_ops,
+	.host_flags 	= IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA | IDE_HFLAG_SINGLE |
+			  IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_UNMASK_IRQS,
+	.pio_mask 	= ATA_PIO5,
+};
+
+/*
+ * If interrupt is delivered through GPIO, IRQ are triggered on falling
+ * and rising edge of signal. Whereas IDE device request interrupt on high
+ * level (rising edge in our case). This mean we have fake interrupts, so
+ * we need to check interrupt pin and exit instantly from ISR when line
+ * is on low level.
+ */
+
+irqreturn_t at91_irq_handler(int irq, void *dev_id)
+{
+	int ntries = 8;
+	int pin_val1, pin_val2;
+
+	/* additional deglitch, line can be noisy in badly designed PCB */
+	do {
+		pin_val1 = at91_get_gpio_value(irq);
+		pin_val2 = at91_get_gpio_value(irq);
+	} while (pin_val1 != pin_val2 && --ntries > 0);
+
+	if (pin_val1 == 0 || ntries <= 0)
+		return IRQ_HANDLED;
+
+	return ide_intr(irq, dev_id);
+}
+
+static int __init at91_ide_probe(struct platform_device *pdev)
+{
+	int ret;
+	hw_regs_t hw;
+	hw_regs_t *hws[] = { &hw, NULL, NULL, NULL };
+	struct ide_host *host;
+	struct resource *res;
+	unsigned long tf_base = 0, ctl_base = 0;
+	struct at91_cf_data *board = pdev->dev.platform_data;
+
+	if (!board)
+		return -ENODEV;
+
+	if (board->det_pin && at91_get_gpio_value(board->det_pin) != 0) {
+		perr("no device detected\n");
+		return -ENODEV;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		perr("can't get memory resource\n");
+		return -ENODEV;
+	}
+
+	if (!devm_request_mem_region(&pdev->dev, res->start + TASK_FILE,
+				     REGS_SIZE, "ide") ||
+	    !devm_request_mem_region(&pdev->dev, res->start + ALT_MODE,
+				     REGS_SIZE, "alt")) {
+		perr("memory resources in use\n");
+		return -EBUSY;
+	}
+
+	pdbg("chipselect %u irq %u res %08lx\n", board->chipselect,
+	     board->irq_pin, (unsigned long) res->start);
+
+	tf_base = (unsigned long) devm_ioremap(&pdev->dev, res->start + TASK_FILE,
+					       REGS_SIZE);
+	ctl_base = (unsigned long) devm_ioremap(&pdev->dev, res->start + ALT_MODE,
+						REGS_SIZE);
+	if (!tf_base || !ctl_base) {
+		perr("can't map memory regions\n");
+		return -EBUSY;
+	}
+
+	memset(&hw, 0, sizeof(hw));
+
+	if (board->flags & AT91_IDE_SWAP_A0_A2) {
+		/* workaround for stupid hardware bug */
+		hw.io_ports.data_addr	= tf_base + 0;
+		hw.io_ports.error_addr	= tf_base + 4;
+		hw.io_ports.nsect_addr	= tf_base + 2;
+		hw.io_ports.lbal_addr	= tf_base + 6;
+		hw.io_ports.lbam_addr	= tf_base + 1;
+		hw.io_ports.lbah_addr	= tf_base + 5;
+		hw.io_ports.device_addr = tf_base + 3;
+		hw.io_ports.command_addr = tf_base + 7;
+		hw.io_ports.ctl_addr	= ctl_base + 3;
+	} else
+		ide_std_init_ports(&hw, tf_base, ctl_base + 6);
+
+	hw.irq = board->irq_pin;
+	hw.chipset = ide_generic;
+	hw.dev = &pdev->dev;
+
+	host = ide_host_alloc(&at91_ide_port_info, hws);
+	if (!host) {
+		perr("failed to allocate ide host\n");
+		return -ENOMEM;
+	}
+
+	/* setup Static Memory Controller - PIO 0 as default */
+	apply_timings(board->chipselect, 0, ide_timing_find_mode(XFER_PIO_0), 0);
+
+	/* with GPIO interrupt we have to do quirks in handler */
+	if (board->irq_pin >= PIN_BASE)
+		host->irq_handler = at91_irq_handler;
+
+	host->ports[0]->select_data = board->chipselect;
+
+	ret = ide_host_register(host, &at91_ide_port_info, hws);
+	if (ret) {
+		perr("failed to register ide host\n");
+		goto err_free_host;
+	}
+	platform_set_drvdata(pdev, host);
+	return 0;
+
+err_free_host:
+	ide_host_free(host);
+	return ret;
+}
+
+static int __exit at91_ide_remove(struct platform_device *pdev)
+{
+	struct ide_host *host = platform_get_drvdata(pdev);
+
+	ide_host_remove(host);
+	return 0;
+}
+
+static struct platform_driver at91_ide_driver = {
+	.driver	= {
+		.name = DRV_NAME,
+		.owner = THIS_MODULE,
+	},
+	.remove	= __exit_p(at91_ide_remove),
+};
+
+static int __init at91_ide_init(void)
+{
+	return platform_driver_probe(&at91_ide_driver, at91_ide_probe);
+}
+
+static void __exit at91_ide_exit(void)
+{
+	platform_driver_unregister(&at91_ide_driver);
+}
+
+module_init(at91_ide_init);
+module_exit(at91_ide_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stanislaw Gruszka <stf_xl@wp.pl>");
+
-- 
cgit v0.10.2


From e565f206082a725108a75bcf00bbe3db21a56474 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <stf_xl@wp.pl>
Date: Thu, 5 Mar 2009 16:10:58 +0100
Subject: AT91: initialize Compact Flash on AT91SAM9263 cpu

Signed-off-by: Stanislaw Gruszka <stf_xl@wp.pl>
Cc: Andrew Victor <avictor.za@gmail.com>
Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Acked-by: Andrew Victor <linux@maxim.org.za>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/arch/arm/mach-at91/at91sam9263_devices.c b/arch/arm/mach-at91/at91sam9263_devices.c
index 134af97..b7f2332 100644
--- a/arch/arm/mach-at91/at91sam9263_devices.c
+++ b/arch/arm/mach-at91/at91sam9263_devices.c
@@ -347,6 +347,111 @@ void __init at91_add_device_mmc(short mmc_id, struct at91_mmc_data *data)
 void __init at91_add_device_mmc(short mmc_id, struct at91_mmc_data *data) {}
 #endif
 
+/* --------------------------------------------------------------------
+ *  Compact Flash (PCMCIA or IDE)
+ * -------------------------------------------------------------------- */
+
+#if defined(CONFIG_AT91_CF) || defined(CONFIG_AT91_CF_MODULE) || \
+    defined(CONFIG_BLK_DEV_IDE_AT91) || defined(CONFIG_BLK_DEV_IDE_AT91_MODULE)
+
+static struct at91_cf_data cf0_data;
+
+static struct resource cf0_resources[] = {
+	[0] = {
+		.start	= AT91_CHIPSELECT_4,
+		.end	= AT91_CHIPSELECT_4 + SZ_256M - 1,
+		.flags	= IORESOURCE_MEM | IORESOURCE_MEM_8AND16BIT,
+	}
+};
+
+static struct platform_device cf0_device = {
+	.id		= 0,
+	.dev		= {
+				.platform_data	= &cf0_data,
+	},
+	.resource	= cf0_resources,
+	.num_resources	= ARRAY_SIZE(cf0_resources),
+};
+
+static struct at91_cf_data cf1_data;
+
+static struct resource cf1_resources[] = {
+	[0] = {
+		.start	= AT91_CHIPSELECT_5,
+		.end	= AT91_CHIPSELECT_5 + SZ_256M - 1,
+		.flags	= IORESOURCE_MEM | IORESOURCE_MEM_8AND16BIT,
+	}
+};
+
+static struct platform_device cf1_device = {
+	.id		= 1,
+	.dev		= {
+				.platform_data	= &cf1_data,
+	},
+	.resource	= cf1_resources,
+	.num_resources	= ARRAY_SIZE(cf1_resources),
+};
+
+void __init at91_add_device_cf(struct at91_cf_data *data)
+{
+	unsigned long ebi0_csa;
+	struct platform_device *pdev;
+
+	if (!data)
+		return;
+
+	/*
+	 * assign CS4 or CS5 to SMC with Compact Flash logic support,
+	 * we assume SMC timings are configured by board code,
+	 * except True IDE where timings are controlled by driver
+	 */
+	ebi0_csa = at91_sys_read(AT91_MATRIX_EBI0CSA);
+	switch (data->chipselect) {
+	case 4:
+		at91_set_A_periph(AT91_PIN_PD6, 0);  /* EBI0_NCS4/CFCS0 */
+		ebi0_csa |= AT91_MATRIX_EBI0_CS4A_SMC_CF1;
+		cf0_data = *data;
+		pdev = &cf0_device;
+		break;
+	case 5:
+		at91_set_A_periph(AT91_PIN_PD7, 0);  /* EBI0_NCS5/CFCS1 */
+		ebi0_csa |= AT91_MATRIX_EBI0_CS5A_SMC_CF2;
+		cf1_data = *data;
+		pdev = &cf1_device;
+		break;
+	default:
+		printk(KERN_ERR "AT91 CF: bad chip-select requested (%u)\n",
+		       data->chipselect);
+		return;
+	}
+	at91_sys_write(AT91_MATRIX_EBI0CSA, ebi0_csa);
+
+	if (data->det_pin) {
+		at91_set_gpio_input(data->det_pin, 1);
+		at91_set_deglitch(data->det_pin, 1);
+	}
+
+	if (data->irq_pin) {
+		at91_set_gpio_input(data->irq_pin, 1);
+		at91_set_deglitch(data->irq_pin, 1);
+	}
+
+	if (data->vcc_pin)
+		/* initially off */
+		at91_set_gpio_output(data->vcc_pin, 0);
+
+	/* enable EBI controlled pins */
+	at91_set_A_periph(AT91_PIN_PD5, 1);  /* NWAIT */
+	at91_set_A_periph(AT91_PIN_PD8, 0);  /* CFCE1 */
+	at91_set_A_periph(AT91_PIN_PD9, 0);  /* CFCE2 */
+	at91_set_A_periph(AT91_PIN_PD14, 0); /* CFNRW */
+
+	pdev->name = (data->flags & AT91_CF_TRUE_IDE) ? "at91_ide" : "at91_cf";
+	platform_device_register(pdev);
+}
+#else
+void __init at91_add_device_cf(struct at91_cf_data *data) {}
+#endif
 
 /* --------------------------------------------------------------------
  *  NAND / SmartMedia
-- 
cgit v0.10.2


From ebcad5aaea26da3cb2ca90b7f31a67a027eb60db Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 5 Mar 2009 16:10:59 +0100
Subject: remove stale comment from <linux/hdreg.h>

HDIO_GET_IDENTITY returns 256 words currently.

Noticed by Norman Diamond.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h
index c37e924..ed21bd3 100644
--- a/include/linux/hdreg.h
+++ b/include/linux/hdreg.h
@@ -511,7 +511,6 @@ struct hd_driveid {
 	unsigned short	words69_70[2];	/* reserved words 69-70
 					 * future command overlap and queuing
 					 */
-	/* HDIO_GET_IDENTITY currently returns only words 0 through 70 */
 	unsigned short	words71_74[4];	/* reserved words 71-74
 					 * for IDENTIFY PACKET DEVICE command
 					 */
-- 
cgit v0.10.2


From 7dbc3f6ead13cb87d2318443ebd8f19a987149aa Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Fri, 6 Mar 2009 00:20:49 +0800
Subject: Blackfin arch: add stubs for anomalies 447 and 448

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf518/include/mach/anomaly.h b/arch/blackfin/mach-bf518/include/mach/anomaly.h
index 143a29d..c847bb1 100644
--- a/arch/blackfin/mach-bf518/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf518/include/mach/anomaly.h
@@ -86,5 +86,7 @@
 #define ANOMALY_05000386 (0)
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
+#define ANOMALY_05000447 (0)
+#define ANOMALY_05000448 (0)
 
 #endif
diff --git a/arch/blackfin/mach-bf527/include/mach/anomaly.h b/arch/blackfin/mach-bf527/include/mach/anomaly.h
index 4ffccb6..df6808d 100644
--- a/arch/blackfin/mach-bf527/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf527/include/mach/anomaly.h
@@ -176,5 +176,7 @@
 #define ANOMALY_05000323 (0)
 #define ANOMALY_05000363 (0)
 #define ANOMALY_05000412 (0)
+#define ANOMALY_05000447 (0)
+#define ANOMALY_05000448 (0)
 
 #endif
diff --git a/arch/blackfin/mach-bf533/include/mach/anomaly.h b/arch/blackfin/mach-bf533/include/mach/anomaly.h
index 9e838f8..1cf893e 100644
--- a/arch/blackfin/mach-bf533/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf533/include/mach/anomaly.h
@@ -283,5 +283,7 @@
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
 #define ANOMALY_05000435 (0)
+#define ANOMALY_05000447 (0)
+#define ANOMALY_05000448 (0)
 
 #endif
diff --git a/arch/blackfin/mach-bf537/include/mach/anomaly.h b/arch/blackfin/mach-bf537/include/mach/anomaly.h
index 438b43f..1bfd80c 100644
--- a/arch/blackfin/mach-bf537/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf537/include/mach/anomaly.h
@@ -173,5 +173,7 @@
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
 #define ANOMALY_05000435 (0)
+#define ANOMALY_05000447 (0)
+#define ANOMALY_05000448 (0)
 
 #endif
diff --git a/arch/blackfin/mach-bf538/include/mach/anomaly.h b/arch/blackfin/mach-bf538/include/mach/anomaly.h
index f4ece1c..3a56998 100644
--- a/arch/blackfin/mach-bf538/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf538/include/mach/anomaly.h
@@ -130,5 +130,7 @@
 #define ANOMALY_05000412 (0)
 #define ANOMALY_05000432 (0)
 #define ANOMALY_05000435 (0)
+#define ANOMALY_05000447 (0)
+#define ANOMALY_05000448 (0)
 
 #endif
diff --git a/arch/blackfin/mach-bf561/include/mach/anomaly.h b/arch/blackfin/mach-bf561/include/mach/anomaly.h
index d78bc6b..d0b0b35 100644
--- a/arch/blackfin/mach-bf561/include/mach/anomaly.h
+++ b/arch/blackfin/mach-bf561/include/mach/anomaly.h
@@ -287,5 +287,7 @@
 #define ANOMALY_05000386 (1)
 #define ANOMALY_05000432 (0)
 #define ANOMALY_05000435 (0)
+#define ANOMALY_05000447 (0)
+#define ANOMALY_05000448 (0)
 
 #endif
-- 
cgit v0.10.2


From d42ad15b759d05a87f22b484af63987eff38ea88 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Thu, 5 Mar 2009 17:20:55 +0100
Subject: ata: add CFA specific identify data words

Declare CFA specific identify data words 162 and 163 for future use.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
[bart: update patch summary/description]
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 08a86d5..9a061ac 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -89,6 +89,8 @@ enum {
 	ATA_ID_DLF		= 128,
 	ATA_ID_CSFO		= 129,
 	ATA_ID_CFA_POWER	= 160,
+	ATA_ID_CFA_KEY_MGMT	= 162,
+	ATA_ID_CFA_MODES	= 163,
 	ATA_ID_ROT_SPEED	= 217,
 	ATA_ID_PIO4		= (1 << 1),
 
-- 
cgit v0.10.2


From 357fd373e194ec5bb02fe875a67c0874ab74b5a6 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Fri, 6 Mar 2009 00:24:01 +0800
Subject: Blackfin arch: remove duplicated ANOMALY_05000448 ifdef check

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-common/arch_checks.c b/arch/blackfin/mach-common/arch_checks.c
index a2ca26a..80d39b2 100644
--- a/arch/blackfin/mach-common/arch_checks.c
+++ b/arch/blackfin/mach-common/arch_checks.c
@@ -68,8 +68,6 @@
 # error "The kernel load address is too high; keep it below 10meg for safety"
 #endif
 
-#ifdef ANOMALY_05000448
-# if ANOMALY_05000448
-#  error You are using a part with anomaly 05000448, this issue causes random memory read/write failures - that means random crashes.
-# endif
+#if ANOMALY_05000448
+# error You are using a part with anomaly 05000448, this issue causes random memory read/write failures - that means random crashes.
 #endif
-- 
cgit v0.10.2


From 4aad7ec373a559177ee2c488455f6bf8928c030c Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier.adi@gmail.com>
Date: Fri, 6 Mar 2009 00:27:17 +0800
Subject: Blackfin arch: disable legacy /proc/scsi/ support by default

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig
index b41dbd0..6bc2fb1 100644
--- a/arch/blackfin/configs/BF548-EZKIT_defconfig
+++ b/arch/blackfin/configs/BF548-EZKIT_defconfig
@@ -680,7 +680,7 @@ CONFIG_SCSI=y
 CONFIG_SCSI_DMA=y
 # CONFIG_SCSI_TGT is not set
 # CONFIG_SCSI_NETLINK is not set
-CONFIG_SCSI_PROC_FS=y
+# CONFIG_SCSI_PROC_FS is not set
 
 #
 # SCSI support type (disk, tape, CD-ROM)
diff --git a/arch/blackfin/configs/CM-BF548_defconfig b/arch/blackfin/configs/CM-BF548_defconfig
index 542ebc5..f410430 100644
--- a/arch/blackfin/configs/CM-BF548_defconfig
+++ b/arch/blackfin/configs/CM-BF548_defconfig
@@ -595,7 +595,7 @@ CONFIG_SCSI=y
 CONFIG_SCSI_DMA=y
 # CONFIG_SCSI_TGT is not set
 # CONFIG_SCSI_NETLINK is not set
-CONFIG_SCSI_PROC_FS=y
+# CONFIG_SCSI_PROC_FS is not set
 
 #
 # SCSI support type (disk, tape, CD-ROM)
diff --git a/arch/blackfin/configs/IP0X_defconfig b/arch/blackfin/configs/IP0X_defconfig
index eae83b5..7db9387 100644
--- a/arch/blackfin/configs/IP0X_defconfig
+++ b/arch/blackfin/configs/IP0X_defconfig
@@ -612,7 +612,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
 CONFIG_SCSI=y
 # CONFIG_SCSI_TGT is not set
 # CONFIG_SCSI_NETLINK is not set
-CONFIG_SCSI_PROC_FS=y
+# CONFIG_SCSI_PROC_FS is not set
 
 #
 # SCSI support type (disk, tape, CD-ROM)
-- 
cgit v0.10.2


From f3f704d375fcc92950f688ccb3dd0f650acace92 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Fri, 6 Mar 2009 00:27:57 +0800
Subject: Blackfin arch: SPI_MMC is now mainlined MMC_SPI

Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/mach-bf518/boards/ezbrd.c b/arch/blackfin/mach-bf518/boards/ezbrd.c
index b044de4..41f2eac 100644
--- a/arch/blackfin/mach-bf518/boards/ezbrd.c
+++ b/arch/blackfin/mach-bf518/boards/ezbrd.c
@@ -182,9 +182,9 @@ static struct bfin5xx_spi_chip spi_switch_info = {
 #endif
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -276,23 +276,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 #endif
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 0,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf527/boards/cm_bf527.c b/arch/blackfin/mach-bf527/boards/cm_bf527.c
index 856c097..48e69ee 100644
--- a/arch/blackfin/mach-bf527/boards/cm_bf527.c
+++ b/arch/blackfin/mach-bf527/boards/cm_bf527.c
@@ -487,9 +487,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip  mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -585,23 +585,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 		.controller_data = &ad9960_spi_chip_info,
 	},
 #endif
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 0,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
-	{
-		.modalias = "spi_mmc",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
+		.modalias = "mmc_spi",
+		.max_speed_hz = 20000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf527/boards/ezbrd.c b/arch/blackfin/mach-bf527/boards/ezbrd.c
index 83606fc..7fe480e 100644
--- a/arch/blackfin/mach-bf527/boards/ezbrd.c
+++ b/arch/blackfin/mach-bf527/boards/ezbrd.c
@@ -256,9 +256,9 @@ static struct bfin5xx_spi_chip spi_adc_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -366,23 +366,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc_dummy",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = 0,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
-	{
-		.modalias = "spi_mmc",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf533/boards/blackstamp.c b/arch/blackfin/mach-bf533/boards/blackstamp.c
index 015c18f..0765872 100644
--- a/arch/blackfin/mach-bf533/boards/blackstamp.c
+++ b/arch/blackfin/mach-bf533/boards/blackstamp.c
@@ -101,9 +101,9 @@ static struct bfin5xx_spi_chip spi_flash_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -129,23 +129,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 20000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 0,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 20000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf533/boards/cm_bf533.c b/arch/blackfin/mach-bf533/boards/cm_bf533.c
index e7061c7..e897487 100644
--- a/arch/blackfin/mach-bf533/boards/cm_bf533.c
+++ b/arch/blackfin/mach-bf533/boards/cm_bf533.c
@@ -96,9 +96,9 @@ static struct bfin5xx_spi_chip ad1836_spi_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -138,23 +138,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 0,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf533/boards/ip0x.c b/arch/blackfin/mach-bf533/boards/ip0x.c
index e30b1b7..f19b6337 100644
--- a/arch/blackfin/mach-bf533/boards/ip0x.c
+++ b/arch/blackfin/mach-bf533/boards/ip0x.c
@@ -127,8 +127,8 @@ static struct platform_device dm9000_device2 = {
 #if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE)
 /* all SPI peripherals info goes here */
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
 /*
  * CPOL (Clock Polarity)
  *  0 - Active high SCK
@@ -152,14 +152,13 @@ static struct bfin5xx_spi_chip spi_mmc_chip_info = {
 /* Notice: for blackfin, the speed_hz is the value of register
  * SPI_BAUD, not the real baudrate */
 static struct spi_board_info bfin_spi_board_info[] __initdata = {
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 2,
 		.bus_num = 1,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 	},
 #endif
 };
diff --git a/arch/blackfin/mach-bf537/boards/cm_bf537.c b/arch/blackfin/mach-bf537/boards/cm_bf537.c
index 9cd8fb2..41c75b9 100644
--- a/arch/blackfin/mach-bf537/boards/cm_bf537.c
+++ b/arch/blackfin/mach-bf537/boards/cm_bf537.c
@@ -108,9 +108,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip  mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -160,23 +160,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 7,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
+		.modalias = "mmc_spi",
+		.max_speed_hz = 20000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 1,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf537/boards/minotaur.c b/arch/blackfin/mach-bf537/boards/minotaur.c
index db7d3a3..3c15981 100644
--- a/arch/blackfin/mach-bf537/boards/minotaur.c
+++ b/arch/blackfin/mach-bf537/boards/minotaur.c
@@ -134,9 +134,9 @@ static struct bfin5xx_spi_chip spi_flash_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -156,23 +156,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc_dummy",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 5000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = 0,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
-	{
-		.modalias = "spi_mmc",
-		.max_speed_hz = 5000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf537/boards/pnav10.c b/arch/blackfin/mach-bf537/boards/pnav10.c
index 590eb3a..4e1de1e 100644
--- a/arch/blackfin/mach-bf537/boards/pnav10.c
+++ b/arch/blackfin/mach-bf537/boards/pnav10.c
@@ -289,9 +289,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -364,23 +364,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 		.controller_data = &ad9960_spi_chip_info,
 	},
 #endif
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 7,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf537/boards/tcm_bf537.c b/arch/blackfin/mach-bf537/boards/tcm_bf537.c
index 3f4f203..53ad10f 100644
--- a/arch/blackfin/mach-bf537/boards/tcm_bf537.c
+++ b/arch/blackfin/mach-bf537/boards/tcm_bf537.c
@@ -108,9 +108,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -160,23 +160,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-	{
-		.modalias = "spi_mmc_dummy",
-		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
-		.bus_num = 0,
-		.chip_select = 7,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
-		.mode = SPI_MODE_3,
-	},
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
diff --git a/arch/blackfin/mach-bf561/boards/cm_bf561.c b/arch/blackfin/mach-bf561/boards/cm_bf561.c
index 6880d1e..f623c6b 100644
--- a/arch/blackfin/mach-bf561/boards/cm_bf561.c
+++ b/arch/blackfin/mach-bf561/boards/cm_bf561.c
@@ -105,9 +105,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = {
 };
 #endif
 
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
-static struct bfin5xx_spi_chip spi_mmc_chip_info = {
-	.enable_dma = 1,
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
+static struct bfin5xx_spi_chip mmc_spi_chip_info = {
+	.enable_dma = 0,
 	.bits_per_word = 8,
 };
 #endif
@@ -155,14 +155,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 		.controller_data = &ad9960_spi_chip_info,
 	},
 #endif
-#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE)
+#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE)
 	{
-		.modalias = "spi_mmc",
+		.modalias = "mmc_spi",
 		.max_speed_hz = 25000000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
-		.chip_select = CONFIG_SPI_MMC_CS_CHAN,
-		.platform_data = NULL,
-		.controller_data = &spi_mmc_chip_info,
+		.chip_select = 5,
+		.controller_data = &mmc_spi_chip_info,
 		.mode = SPI_MODE_3,
 	},
 #endif
-- 
cgit v0.10.2


From 9eb77ab0762d8cfc4686e1ec5e9a52905f3a5009 Mon Sep 17 00:00:00 2001
From: Xose Vazquez Perez <xose.vazquez@gmail.com>
Date: Sat, 28 Feb 2009 00:26:09 +0100
Subject: rt2x00 : more devices to rt2500usb.c

add more usb_dev to rt2500usb.c . IDs 'stolen' from the
windows inf file(02/12/2009, 2.01.01.0015).

Signed-off-by: Xose Vazquez Perez <xose.vazquez@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/rt2x00/rt2500usb.c b/drivers/net/wireless/rt2x00/rt2500usb.c
index af6b584..3e2ac2b 100644
--- a/drivers/net/wireless/rt2x00/rt2500usb.c
+++ b/drivers/net/wireless/rt2x00/rt2500usb.c
@@ -1952,6 +1952,8 @@ static struct usb_device_id rt2500usb_device_table[] = {
 	{ USB_DEVICE(0x13b1, 0x000d), USB_DEVICE_DATA(&rt2500usb_ops) },
 	{ USB_DEVICE(0x13b1, 0x0011), USB_DEVICE_DATA(&rt2500usb_ops) },
 	{ USB_DEVICE(0x13b1, 0x001a), USB_DEVICE_DATA(&rt2500usb_ops) },
+	/* CNet */
+	{ USB_DEVICE(0x1371, 0x9022), USB_DEVICE_DATA(&rt2500usb_ops) },
 	/* Conceptronic */
 	{ USB_DEVICE(0x14b2, 0x3c02), USB_DEVICE_DATA(&rt2500usb_ops) },
 	/* D-LINK */
@@ -1976,14 +1978,20 @@ static struct usb_device_id rt2500usb_device_table[] = {
 	{ USB_DEVICE(0x148f, 0x2570), USB_DEVICE_DATA(&rt2500usb_ops) },
 	{ USB_DEVICE(0x148f, 0x2573), USB_DEVICE_DATA(&rt2500usb_ops) },
 	{ USB_DEVICE(0x148f, 0x9020), USB_DEVICE_DATA(&rt2500usb_ops) },
+	/* Sagem */
+	{ USB_DEVICE(0x079b, 0x004b), USB_DEVICE_DATA(&rt2500usb_ops) },
 	/* Siemens */
 	{ USB_DEVICE(0x0681, 0x3c06), USB_DEVICE_DATA(&rt2500usb_ops) },
 	/* SMC */
 	{ USB_DEVICE(0x0707, 0xee13), USB_DEVICE_DATA(&rt2500usb_ops) },
 	/* Spairon */
 	{ USB_DEVICE(0x114b, 0x0110), USB_DEVICE_DATA(&rt2500usb_ops) },
+	/* SURECOM */
+	{ USB_DEVICE(0x0769, 0x11f3), USB_DEVICE_DATA(&rt2500usb_ops) },
 	/* Trust */
 	{ USB_DEVICE(0x0eb0, 0x9020), USB_DEVICE_DATA(&rt2500usb_ops) },
+	/* VTech */
+	{ USB_DEVICE(0x0f88, 0x3012), USB_DEVICE_DATA(&rt2500usb_ops) },
 	/* Zinwell */
 	{ USB_DEVICE(0x5a57, 0x0260), USB_DEVICE_DATA(&rt2500usb_ops) },
 	{ 0, }
-- 
cgit v0.10.2


From ef4bb70d876b4ae5787cc43727477d1a36cdc2e8 Mon Sep 17 00:00:00 2001
From: Xose Vazquez Perez <xose.vazquez@gmail.com>
Date: Sat, 28 Feb 2009 00:34:23 +0100
Subject: rt2x00 : more devices to rt73usb.c

add more usb_dev to rt73usb.c . IDs 'stolen' from the
windows inf file(10/21/2008, 1.03.02.0000) plus some
from the Ralink linux driver(2009_0206_RT73_Linux_STA_Drv1.1.0.2.tar.bz2)

Signed-off-by: Xose Vazquez Perez <xose.vazquez@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/rt2x00/rt73usb.c b/drivers/net/wireless/rt2x00/rt73usb.c
index 96a8d69..cefee1b 100644
--- a/drivers/net/wireless/rt2x00/rt73usb.c
+++ b/drivers/net/wireless/rt2x00/rt73usb.c
@@ -2281,7 +2281,18 @@ static const struct rt2x00_ops rt73usb_ops = {
  */
 static struct usb_device_id rt73usb_device_table[] = {
 	/* AboCom */
+	{ USB_DEVICE(0x07b8, 0xb21b), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x07b8, 0xb21c), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x07b8, 0xb21d), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x07b8, 0xb21e), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x07b8, 0xb21f), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* AL */
+	{ USB_DEVICE(0x14b2, 0x3c10), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* Amigo */
+	{ USB_DEVICE(0x148f, 0x9021), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x0eb0, 0x9021), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* AMIT  */
+	{ USB_DEVICE(0x18c5, 0x0002), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Askey */
 	{ USB_DEVICE(0x1690, 0x0722), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* ASUS */
@@ -2294,7 +2305,9 @@ static struct usb_device_id rt73usb_device_table[] = {
 	{ USB_DEVICE(0x050d, 0x905c), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Billionton */
 	{ USB_DEVICE(0x1631, 0xc019), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x08dd, 0x0120), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Buffalo */
+	{ USB_DEVICE(0x0411, 0x00d8), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x0411, 0x00f4), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* CNet */
 	{ USB_DEVICE(0x1371, 0x9022), USB_DEVICE_DATA(&rt73usb_ops) },
@@ -2308,6 +2321,11 @@ static struct usb_device_id rt73usb_device_table[] = {
 	{ USB_DEVICE(0x07d1, 0x3c04), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x07d1, 0x3c06), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x07d1, 0x3c07), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* Edimax */
+	{ USB_DEVICE(0x7392, 0x7318), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x7392, 0x7618), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* EnGenius */
+	{ USB_DEVICE(0x1740, 0x3701), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Gemtek */
 	{ USB_DEVICE(0x15a9, 0x0004), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Gigabyte */
@@ -2328,22 +2346,34 @@ static struct usb_device_id rt73usb_device_table[] = {
 	{ USB_DEVICE(0x0db0, 0xa861), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x0db0, 0xa874), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Ralink */
+	{ USB_DEVICE(0x04bb, 0x093d), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x148f, 0x2573), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x148f, 0x2671), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Qcom */
 	{ USB_DEVICE(0x18e8, 0x6196), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x18e8, 0x6229), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x18e8, 0x6238), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* Samsung */
+	{ USB_DEVICE(0x04e8, 0x4471), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Senao */
 	{ USB_DEVICE(0x1740, 0x7100), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Sitecom */
-	{ USB_DEVICE(0x0df6, 0x9712), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x0df6, 0x0024), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x0df6, 0x0027), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x0df6, 0x002f), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x0df6, 0x90ac), USB_DEVICE_DATA(&rt73usb_ops) },
+	{ USB_DEVICE(0x0df6, 0x9712), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Surecom */
 	{ USB_DEVICE(0x0769, 0x31f3), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* Philips */
+	{ USB_DEVICE(0x0471, 0x200a), USB_DEVICE_DATA(&rt73usb_ops) },
 	/* Planex */
 	{ USB_DEVICE(0x2019, 0xab01), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ USB_DEVICE(0x2019, 0xab50), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* Zcom */
+	{ USB_DEVICE(0x0cde, 0x001c), USB_DEVICE_DATA(&rt73usb_ops) },
+	/* ZyXEL */
+	{ USB_DEVICE(0x0586, 0x3415), USB_DEVICE_DATA(&rt73usb_ops) },
 	{ 0, }
 };
 
-- 
cgit v0.10.2


From 623d563e52d4d4041612e24b33a5610a900dd778 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Tue, 3 Mar 2009 11:37:04 -0800
Subject: iwlwifi: fix error flow in iwl*_pci_probe

Both the agn and 3945 drivers has some problems with dealing with
errors in their probe functions. Ensure that a goto will undo only
things that was done before the goto was called.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index 36bafeb..129e2d3 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -3868,7 +3868,7 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 	err = iwl_eeprom_check_version(priv);
 	if (err)
-		goto out_iounmap;
+		goto out_free_eeprom;
 
 	/* extract MAC Address */
 	iwl_eeprom_get_mac(priv, priv->mac_addr);
@@ -3945,6 +3945,8 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	return 0;
 
  out_remove_sysfs:
+	destroy_workqueue(priv->workqueue);
+	priv->workqueue = NULL;
 	sysfs_remove_group(&pdev->dev.kobj, &iwl_attribute_group);
  out_uninit_drv:
 	iwl_uninit_drv(priv);
@@ -3953,8 +3955,8 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
  out_iounmap:
 	pci_iounmap(pdev, priv->hw_base);
  out_pci_release_regions:
-	pci_release_regions(pdev);
 	pci_set_drvdata(pdev, NULL);
+	pci_release_regions(pdev);
  out_pci_disable_device:
 	pci_disable_device(pdev);
  out_ieee80211_free_hw:
diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
index 93be74a..57dd34e 100644
--- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
+++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
@@ -7911,7 +7911,7 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e
 				CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY, 25000);
 	if (err < 0) {
 		IWL_DEBUG_INFO("Failed to init the card\n");
-		goto out_remove_sysfs;
+		goto out_iounmap;
 	}
 
 	/***********************
@@ -7921,7 +7921,7 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e
 	err = iwl3945_eeprom_init(priv);
 	if (err) {
 		IWL_ERROR("Unable to init EEPROM\n");
-		goto out_remove_sysfs;
+		goto out_iounmap;
 	}
 	/* MAC Address location in EEPROM same for 3945/4965 */
 	get_eeprom_mac(priv, priv->mac_addr);
@@ -7975,7 +7975,7 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e
 	err = iwl3945_init_channel_map(priv);
 	if (err) {
 		IWL_ERROR("initializing regulatory failed: %d\n", err);
-		goto out_release_irq;
+		goto out_unset_hw_setting;
 	}
 
 	err = iwl3945_init_geos(priv);
@@ -8045,25 +8045,22 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e
 	return 0;
 
  out_remove_sysfs:
+	destroy_workqueue(priv->workqueue);
+	priv->workqueue = NULL;
 	sysfs_remove_group(&pdev->dev.kobj, &iwl3945_attribute_group);
  out_free_geos:
 	iwl3945_free_geos(priv);
  out_free_channel_map:
 	iwl3945_free_channel_map(priv);
-
-
- out_release_irq:
-	destroy_workqueue(priv->workqueue);
-	priv->workqueue = NULL;
+ out_unset_hw_setting:
 	iwl3945_unset_hw_setting(priv);
-
  out_iounmap:
 	pci_iounmap(pdev, priv->hw_base);
  out_pci_release_regions:
 	pci_release_regions(pdev);
  out_pci_disable_device:
-	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);
+	pci_disable_device(pdev);
  out_ieee80211_free_hw:
 	ieee80211_free_hw(priv->hw);
  out:
-- 
cgit v0.10.2


From c9a0c8a6845b5efb64841f40b8efb4c387051d46 Mon Sep 17 00:00:00 2001
From: Wim Van Sebroeck <wim@iguana.be>
Date: Wed, 25 Feb 2009 13:33:01 +0100
Subject: [WATCHDOG] orion5x_wdt.c: 'ORION5X_TCLK' undeclared

orion5x_wdt no longer compiled after the changes in commit
ebe35aff883496c07248df82c8576c3b6e84bbbe ("Orion: prepare for
runtime-determined timer tick rate").

The tick rate define (ORION5X_TCLK) was removed in favor of a runtime
detection. The quick fix is to add the define in the watchdog driver.
The fix is not correct for all supported orion5x platforms, but since
the supported platforms right now are 133 Mhz and 166 Mhz, it won't
be _that_ far off. ;-) A fix that uses the runtime-determined timer
tick rate will be applied later.

Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Cc: Kristof Provost <kristof@sigsegv.be>
Acked-by: Lennert Buytenhek <buytenh@wantstofly.org>
Cc: Nicolas Pitre <nico@cam.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>

diff --git a/drivers/watchdog/orion5x_wdt.c b/drivers/watchdog/orion5x_wdt.c
index 14a339f..b64ae1a 100644
--- a/drivers/watchdog/orion5x_wdt.c
+++ b/drivers/watchdog/orion5x_wdt.c
@@ -29,6 +29,7 @@
 #define  WDT_EN			0x0010
 #define WDT_VAL			(TIMER_VIRT_BASE + 0x0024)
 
+#define ORION5X_TCLK		166666667
 #define WDT_MAX_DURATION	(0xffffffff / ORION5X_TCLK)
 #define WDT_IN_USE		0
 #define WDT_OK_TO_CLOSE		1
-- 
cgit v0.10.2


From 6a242909b01120f6f3d571c0b75e20ec61f0d8d3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:58 +0900
Subject: percpu: clean up percpu constants

Impact: cleaup

Make the following cleanups.

* There isn't much arch-specific about PERCPU_MODULE_RESERVE.  Always
  define it whether arch overrides PERCPU_ENOUGH_ROOM or not.

* blackfin overrides PERCPU_ENOUGH_ROOM to align static area size.  Do
  it by default.

* percpu allocation sizes doesn't have much to do with the page size.
  Don't use PAGE_SHIFT in their definition.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/include/asm/percpu.h b/arch/blackfin/include/asm/percpu.h
index 797c0c1..c94c7bc 100644
--- a/arch/blackfin/include/asm/percpu.h
+++ b/arch/blackfin/include/asm/percpu.h
@@ -3,14 +3,4 @@
 
 #include <asm-generic/percpu.h>
 
-#ifdef CONFIG_MODULES
-#define PERCPU_MODULE_RESERVE 8192
-#else
-#define PERCPU_MODULE_RESERVE 0
-#endif
-
-#define PERCPU_ENOUGH_ROOM \
-	(ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
-	 PERCPU_MODULE_RESERVE)
-
 #endif	/* __ARCH_BLACKFIN_PERCPU__ */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 545b068..2d34b03 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -5,6 +5,7 @@
 #include <linux/slab.h> /* For kmalloc() */
 #include <linux/smp.h>
 #include <linux/cpumask.h>
+#include <linux/pfn.h>
 
 #include <asm/percpu.h>
 
@@ -52,17 +53,18 @@
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
 
-/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
-#ifndef PERCPU_ENOUGH_ROOM
+/* enough to cover all DEFINE_PER_CPUs in modules */
 #ifdef CONFIG_MODULES
-#define PERCPU_MODULE_RESERVE	8192
+#define PERCPU_MODULE_RESERVE		(8 << 10)
 #else
-#define PERCPU_MODULE_RESERVE	0
+#define PERCPU_MODULE_RESERVE		0
 #endif
 
+#ifndef PERCPU_ENOUGH_ROOM
 #define PERCPU_ENOUGH_ROOM						\
-	(__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE)
-#endif	/* PERCPU_ENOUGH_ROOM */
+	(ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) +	\
+	 PERCPU_MODULE_RESERVE)
+#endif
 
 /*
  * Must be an lvalue. Since @var must be a simple identifier,
@@ -79,7 +81,7 @@
 #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 
 /* minimum unit size, also is the maximum supported allocation size */
-#define PCPU_MIN_UNIT_SIZE		(16UL << PAGE_SHIFT)
+#define PCPU_MIN_UNIT_SIZE		PFN_ALIGN(64 << 10)
 
 /*
  * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
@@ -96,15 +98,15 @@
 #ifndef PERCPU_DYNAMIC_RESERVE
 #  if BITS_PER_LONG > 32
 #    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(6 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(24 << 10)
 #    else
-#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
 #    endif
 #  else
 #    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
 #    else
-#      define PERCPU_DYNAMIC_RESERVE	(2 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(8 << 10)
 #    endif
 #  endif
 #endif	/* PERCPU_DYNAMIC_RESERVE */
-- 
cgit v0.10.2


From 2441d15c97d498b18f03ae9fba262ffeae42a08b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: cosmetic renames in pcpu_setup_first_chunk()

Impact: cosmetic, preparation for future changes

Make the following renames in pcpur_setup_first_chunk() in preparation
for future changes.

* s/free_size/dyn_size/
* s/static_vm/first_vm/
* s/static_chunk/schunk/

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 2d34b03..a0b4ea2 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -118,7 +118,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 					size_t static_size, size_t unit_size,
-					size_t free_size, void *base_addr,
+					size_t dyn_size, void *base_addr,
 					pcpu_populate_pte_fn_t populate_pte_fn);
 
 /*
diff --git a/mm/percpu.c b/mm/percpu.c
index 3d0f545..9531590 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
- * @free_size: free size in bytes, 0 for auto
+ * @dyn_size: free size for dynamic allocation in bytes, 0 for auto
  * @base_addr: mapped address, NULL for auto
  * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
  *
@@ -849,12 +849,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * return the same number of pages for all cpus.
  *
  * @unit_size, if non-zero, determines unit size and must be aligned
- * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
+ * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size.
  *
- * @free_size determines the number of free bytes after the static
+ * @dyn_size determines the number of free bytes after the static
  * area in the first chunk.  If zero, whatever left is available.
  * Specifying non-zero value make percpu leave the area after
- * @static_size + @free_size alone.
+ * @static_size + @dyn_size alone.
  *
  * Non-null @base_addr means that the caller already allocated virtual
  * region for the first chunk and mapped it.  percpu must not mess
@@ -870,19 +870,19 @@ EXPORT_SYMBOL_GPL(free_percpu);
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     size_t static_size, size_t unit_size,
-				     size_t free_size, void *base_addr,
+				     size_t dyn_size, void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
-	static struct vm_struct static_vm;
-	struct pcpu_chunk *static_chunk;
+	static struct vm_struct first_vm;
+	struct pcpu_chunk *schunk;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 
 	/* santiy checks */
 	BUG_ON(!static_size);
-	BUG_ON(!unit_size && free_size);
-	BUG_ON(unit_size && unit_size < static_size + free_size);
+	BUG_ON(!unit_size && dyn_size);
+	BUG_ON(unit_size && unit_size < static_size + dyn_size);
 	BUG_ON(unit_size & ~PAGE_MASK);
 	BUG_ON(base_addr && !unit_size);
 	BUG_ON(base_addr && populate_pte_fn);
@@ -908,24 +908,24 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 
-	/* init static_chunk */
-	static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
-	INIT_LIST_HEAD(&static_chunk->list);
-	static_chunk->vm = &static_vm;
+	/* init static chunk */
+	schunk = alloc_bootmem(pcpu_chunk_struct_size);
+	INIT_LIST_HEAD(&schunk->list);
+	schunk->vm = &first_vm;
 
-	if (free_size)
-		static_chunk->free_size = free_size;
+	if (dyn_size)
+		schunk->free_size = dyn_size;
 	else
-		static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+		schunk->free_size = pcpu_unit_size - pcpu_static_size;
 
-	static_chunk->contig_hint = static_chunk->free_size;
+	schunk->contig_hint = schunk->free_size;
 
 	/* allocate vm address */
-	static_vm.flags = VM_ALLOC;
-	static_vm.size = pcpu_chunk_size;
+	first_vm.flags = VM_ALLOC;
+	first_vm.size = pcpu_chunk_size;
 
 	if (!base_addr)
-		vm_area_register_early(&static_vm, PAGE_SIZE);
+		vm_area_register_early(&first_vm, PAGE_SIZE);
 	else {
 		/*
 		 * Pages already mapped.  No need to remap into
@@ -933,8 +933,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		 * be mapped or unmapped by percpu and is marked
 		 * immutable.
 		 */
-		static_vm.addr = base_addr;
-		static_chunk->immutable = true;
+		first_vm.addr = base_addr;
+		schunk->immutable = true;
 	}
 
 	/* assign pages */
@@ -945,7 +945,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
 			if (!page)
 				break;
-			*pcpu_chunk_pagep(static_chunk, cpu, i) = page;
+			*pcpu_chunk_pagep(schunk, cpu, i) = page;
 		}
 
 		BUG_ON(i < PFN_UP(pcpu_static_size));
@@ -960,20 +960,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	if (populate_pte_fn) {
 		for_each_possible_cpu(cpu)
 			for (i = 0; i < nr_pages; i++)
-				populate_pte_fn(pcpu_chunk_addr(static_chunk,
+				populate_pte_fn(pcpu_chunk_addr(schunk,
 								cpu, i));
 
-		err = pcpu_map(static_chunk, 0, nr_pages);
+		err = pcpu_map(schunk, 0, nr_pages);
 		if (err)
 			panic("failed to setup static percpu area, err=%d\n",
 			      err);
 	}
 
-	/* link static_chunk in */
-	pcpu_chunk_relocate(static_chunk, -1);
-	pcpu_chunk_addr_insert(static_chunk);
+	/* link the first chunk in */
+	pcpu_chunk_relocate(schunk, -1);
+	pcpu_chunk_addr_insert(schunk);
 
 	/* we're done */
-	pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
 	return pcpu_unit_size;
 }
-- 
cgit v0.10.2


From 61ace7fa2fff9c4b6641c506b6b3f1a9394a1b11 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: improve first chunk initial area map handling

Impact: no functional change

When the first chunk is created, its initial area map is not allocated
because kmalloc isn't online yet.  The map is allocated and
initialized on the first allocation request on the chunk.  This works
fine but the scattering of initialization logic between the init
function and allocation path is a bit confusing.

This patch makes the first chunk initialize and use minimal statically
allocated map from pcpu_setpu_first_chunk().  The map resizing path
still needs to handle this specially but it's more straight-forward
and gives more latitude to the init path.  This will ease future
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 9531590..503ccad 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -93,9 +93,6 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-/* the size of kernel static area */
-static int pcpu_static_size __read_mostly;
-
 /*
  * One mutex to rule them all.
  *
@@ -316,15 +313,28 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 
 	/* reallocation required? */
 	if (chunk->map_alloc < target) {
-		int new_alloc = chunk->map_alloc;
+		int new_alloc;
 		int *new;
 
+		new_alloc = PCPU_DFL_MAP_ALLOC;
 		while (new_alloc < target)
 			new_alloc *= 2;
 
-		new = pcpu_realloc(chunk->map,
-				   chunk->map_alloc * sizeof(new[0]),
-				   new_alloc * sizeof(new[0]));
+		if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
+			/*
+			 * map_alloc smaller than the default size
+			 * indicates that the chunk is one of the
+			 * first chunks and still using static map.
+			 * Allocate a dynamic one and copy.
+			 */
+			new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
+			if (new)
+				memcpy(new, chunk->map,
+				       chunk->map_alloc * sizeof(new[0]));
+		} else
+			new = pcpu_realloc(chunk->map,
+					   chunk->map_alloc * sizeof(new[0]),
+					   new_alloc * sizeof(new[0]));
 		if (!new)
 			return -ENOMEM;
 
@@ -367,22 +377,6 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 	int max_contig = 0;
 	int i, off;
 
-	/*
-	 * The static chunk initially doesn't have map attached
-	 * because kmalloc wasn't available during init.  Give it one.
-	 */
-	if (unlikely(!chunk->map)) {
-		chunk->map = pcpu_realloc(NULL, 0,
-				PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
-		if (!chunk->map)
-			return -ENOMEM;
-
-		chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
-		chunk->map[chunk->map_used++] = -pcpu_static_size;
-		if (chunk->free_size)
-			chunk->map[chunk->map_used++] = chunk->free_size;
-	}
-
 	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
 		bool is_last = i + 1 == chunk->map_used;
 		int head, tail;
@@ -874,12 +868,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
+	static int smap[2];
 	struct pcpu_chunk *schunk;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 
 	/* santiy checks */
+	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
 	BUG_ON(!unit_size && dyn_size);
 	BUG_ON(unit_size && unit_size < static_size + dyn_size);
@@ -893,7 +889,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 					PFN_UP(static_size));
 
-	pcpu_static_size = static_size;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
@@ -912,14 +907,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&schunk->list);
 	schunk->vm = &first_vm;
+	schunk->map = smap;
+	schunk->map_alloc = ARRAY_SIZE(smap);
 
 	if (dyn_size)
 		schunk->free_size = dyn_size;
 	else
-		schunk->free_size = pcpu_unit_size - pcpu_static_size;
+		schunk->free_size = pcpu_unit_size - static_size;
 
 	schunk->contig_hint = schunk->free_size;
 
+	schunk->map[schunk->map_used++] = -static_size;
+	if (schunk->free_size)
+		schunk->map[schunk->map_used++] = schunk->free_size;
+
 	/* allocate vm address */
 	first_vm.flags = VM_ALLOC;
 	first_vm.size = pcpu_chunk_size;
@@ -948,7 +949,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 			*pcpu_chunk_pagep(schunk, cpu, i) = page;
 		}
 
-		BUG_ON(i < PFN_UP(pcpu_static_size));
+		BUG_ON(i < PFN_UP(static_size));
 
 		if (nr_pages < 0)
 			nr_pages = i;
-- 
cgit v0.10.2


From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: use negative for auto for pcpu_setup_first_chunk() arguments

Impact: argument semantic cleanup

In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant
auto-sizing.  It's okay for @unit_size as 0 doesn't make sense but 0
dynamic reserve size is valid.  Alos, if arch @dyn_size is calculated
from other parameters, it might end up passing in 0 @dyn_size and
malfunction when the size is automatically adjusted.

This patch makes both @unit_size and @dyn_size ssize_t and use -1 for
auto sizing.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index c29f301..ef3a2cd3f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL,
 				     pcpu4k_populate_pte);
 	goto out_free_ar;
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a0b4ea2..a96fc53 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -117,8 +117,9 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-					size_t static_size, size_t unit_size,
-					size_t dyn_size, void *base_addr,
+					size_t static_size,
+					ssize_t unit_size, ssize_t dyn_size,
+					void *base_addr,
 					pcpu_populate_pte_fn_t populate_pte_fn);
 
 /*
diff --git a/mm/percpu.c b/mm/percpu.c
index 503ccad..a84cf99 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -824,8 +824,8 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
- * @dyn_size: free size for dynamic allocation in bytes, 0 for auto
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  * @base_addr: mapped address, NULL for auto
  * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
  *
@@ -842,13 +842,14 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * indicates end of pages for the cpu.  Note that @get_page_fn() must
  * return the same number of pages for all cpus.
  *
- * @unit_size, if non-zero, determines unit size and must be aligned
- * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size.
+ * @unit_size, if non-negative, specifies unit size and must be
+ * aligned to PAGE_SIZE and equal to or larger than @static_size +
+ * @dyn_size.
  *
- * @dyn_size determines the number of free bytes after the static
- * area in the first chunk.  If zero, whatever left is available.
- * Specifying non-zero value make percpu leave the area after
- * @static_size + @dyn_size alone.
+ * @dyn_size, if non-negative, limits the number of bytes available
+ * for dynamic allocation in the first chunk.  Specifying non-negative
+ * value make percpu leave alone the area beyond @static_size +
+ * @dyn_size.
  *
  * Non-null @base_addr means that the caller already allocated virtual
  * region for the first chunk and mapped it.  percpu must not mess
@@ -863,8 +864,9 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * percpu access.
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-				     size_t static_size, size_t unit_size,
-				     size_t dyn_size, void *base_addr,
+				     size_t static_size,
+				     ssize_t unit_size, ssize_t dyn_size,
+				     void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
@@ -877,13 +879,17 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	/* santiy checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
-	BUG_ON(!unit_size && dyn_size);
-	BUG_ON(unit_size && unit_size < static_size + dyn_size);
-	BUG_ON(unit_size & ~PAGE_MASK);
-	BUG_ON(base_addr && !unit_size);
+	if (unit_size >= 0) {
+		BUG_ON(unit_size < static_size +
+				   (dyn_size >= 0 ? dyn_size : 0));
+		BUG_ON(unit_size & ~PAGE_MASK);
+	} else {
+		BUG_ON(dyn_size >= 0);
+		BUG_ON(base_addr);
+	}
 	BUG_ON(base_addr && populate_pte_fn);
 
-	if (unit_size)
+	if (unit_size >= 0)
 		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	else
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
@@ -894,6 +900,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 
+	if (dyn_size < 0)
+		dyn_size = pcpu_unit_size - static_size;
+
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
@@ -909,12 +918,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
-
-	if (dyn_size)
-		schunk->free_size = dyn_size;
-	else
-		schunk->free_size = pcpu_unit_size - static_size;
-
+	schunk->free_size = dyn_size;
 	schunk->contig_hint = schunk->free_size;
 
 	schunk->map[schunk->map_used++] = -static_size;
-- 
cgit v0.10.2


From 9a4f8a878b68d5a5d9ee60908a52cf6a55e1b823 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: x86: make embedding percpu allocator return excessive free space

Impact: reduce unnecessary memory usage on certain configurations

Embedding percpu allocator allocates unit_size *
smp_num_possible_cpus() bytes consecutively and use it for the first
chunk.  However, if the static area is small, this can result in
excessive prellocated free space in the first chunk due to
PCPU_MIN_UNIT_SIZE restriction.

This patch makes embedding percpu allocator preallocate only what's
necessary as described by PERPCU_DYNAMIC_RESERVE and return the
leftover to the bootmem allocator.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef3a2cd3f..38e2b2a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -241,24 +241,31 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
  * Embedding allocator
  *
  * The first chunk is sized to just contain the static area plus
- * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
- * bootmem allocator and used as-is without being mapped into vmalloc
- * area.  This enables the first chunk to piggy back on the linear
- * physical PMD mapping and doesn't add any additional pressure to
- * TLB.
+ * module and dynamic reserves, and allocated as a contiguous area
+ * using bootmem allocator and used as-is without being mapped into
+ * vmalloc area.  This enables the first chunk to piggy back on the
+ * linear physical PMD mapping and doesn't add any additional pressure
+ * to TLB.  Note that if the needed size is smaller than the minimum
+ * unit size, the leftover is returned to the bootmem allocator.
  */
 static void *pcpue_ptr __initdata;
+static size_t pcpue_size __initdata;
 static size_t pcpue_unit_size __initdata;
 
 static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 {
-	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
-			    + ((size_t)pageno << PAGE_SHIFT));
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpue_size)
+		return NULL;
+
+	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
 }
 
 static ssize_t __init setup_pcpu_embed(size_t static_size)
 {
 	unsigned int cpu;
+	size_t dyn_size;
 
 	/*
 	 * If large page isn't supported, there's no benefit in doing
@@ -269,25 +276,30 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 		return -EINVAL;
 
 	/* allocate and copy */
-	pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
-	pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
+	pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
+	dyn_size = pcpue_size - static_size;
+
 	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
 				       PAGE_SIZE);
 	if (!pcpue_ptr)
 		return -ENOMEM;
 
-	for_each_possible_cpu(cpu)
-		memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
-		       static_size);
+	for_each_possible_cpu(cpu) {
+		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+
+		free_bootmem(__pa(ptr + pcpue_size),
+			     pcpue_unit_size - pcpue_size);
+		memcpy(ptr, __per_cpu_load, static_size);
+	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
-		pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
 	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
-				      pcpue_unit_size,
-				      pcpue_unit_size - static_size, pcpue_ptr,
-				      NULL);
+				      pcpue_unit_size, dyn_size,
+				      pcpue_ptr, NULL);
 }
 
 /*
-- 
cgit v0.10.2


From 3e24aa58907c62bc79d1094e941a374568f62522 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: add an indirection ptr for chunk page map access

Impact: allow sharing page map, no functional difference yet

Make chunk->page access indirect by adding a pointer and renaming the
actual array to page_ar.  This will be used by future changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index a84cf99..5b47d9f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,7 +80,8 @@ struct pcpu_chunk {
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
 	bool			immutable;	/* no [de]population allowed */
-	struct page		*page[];	/* #cpus * UNIT_PAGES */
+	struct page		**page;		/* points to page array */
+	struct page		*page_ar[];	/* #cpus * UNIT_PAGES */
 };
 
 static int pcpu_unit_pages __read_mostly;
@@ -696,6 +697,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
+	chunk->page = chunk->page_ar;
 
 	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 	if (!chunk->vm) {
@@ -918,6 +920,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
+	schunk->page = schunk->page_ar;
 	schunk->free_size = dyn_size;
 	schunk->contig_hint = schunk->free_size;
 
-- 
cgit v0.10.2


From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu, module: implement reserved allocation and use it for module
 percpu variables

Impact: add reserved allocation functionality and use it for module
	percpu variables

This patch implements reserved allocation from the first chunk.  When
setting up the first chunk, arch can ask to set aside certain number
of bytes right after the core static area which is available only
through a separate reserved allocator.  This will be used primarily
for module static percpu variables on architectures with limited
relocation range to ensure that the module perpcu symbols are inside
the relocatable range.

If reserved area is requested, the first chunk becomes reserved and
isn't available for regular allocation.  If the first chunk also
includes piggy-back dynamic allocation area, a separate chunk mapping
the same region is created to serve dynamic allocation.  The first one
is called static first chunk and the second dynamic first chunk.
Although they share the page map, their different area map
initializations guarantee they serve disjoint areas according to their
purposes.

If arch doesn't setup reserved area, reserved allocation is handled
like any other allocation.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 38e2b2a..dd4eabc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -217,7 +217,7 @@ proceed:
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", vm.addr, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE,
 				     pcpur_size - static_size, vm.addr, NULL);
 	goto out_free_ar;
 
@@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
-	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0,
 				      pcpue_unit_size, dyn_size,
 				      pcpue_ptr, NULL);
 }
@@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL,
-				     pcpu4k_populate_pte);
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1,
+				     NULL, pcpu4k_populate_pte);
 	goto out_free_ar;
 
 enomem:
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a96fc53..8ff1515 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-					size_t static_size,
-					ssize_t unit_size, ssize_t dyn_size,
-					void *base_addr,
-					pcpu_populate_pte_fn_t populate_pte_fn);
+				size_t static_size, size_t reserved_size,
+				ssize_t unit_size, ssize_t dyn_size,
+				void *base_addr,
+				pcpu_populate_pte_fn_t populate_pte_fn);
 
 /*
  * Use this to get to a cpu's version of the per-cpu object
@@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
  */
 #define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
 
+extern void *__alloc_reserved_percpu(size_t size, size_t align);
+
 #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 
 struct percpu_data {
diff --git a/kernel/module.c b/kernel/module.c
index 1f0657a..f0e04d6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 		align = PAGE_SIZE;
 	}
 
-	ptr = __alloc_percpu(size, align);
+	ptr = __alloc_reserved_percpu(size, align);
 	if (!ptr)
 		printk(KERN_WARNING
 		       "Could not allocate %lu bytes percpu data\n", size);
diff --git a/mm/percpu.c b/mm/percpu.c
index 5b47d9f..ef8e169 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
+/* optional reserved chunk, only accessible for reserved allocations */
+static struct pcpu_chunk *pcpu_reserved_chunk;
+/* offset limit of the reserved chunk */
+static int pcpu_reserved_chunk_limit;
+
 /*
  * One mutex to rule them all.
  *
@@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size)
  *
  * This function is called after an allocation or free changed @chunk.
  * New slot according to the changed state is determined and @chunk is
- * moved to the slot.
+ * moved to the slot.  Note that the reserved chunk is never put on
+ * chunk slots.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
 	int nslot = pcpu_chunk_slot(chunk);
 
-	if (oslot != nslot) {
+	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
 		if (oslot < nslot)
 			list_move(&chunk->list, &pcpu_slot[nslot]);
 		else
@@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 	struct rb_node *n, *parent;
 	struct pcpu_chunk *chunk;
 
+	/* is it in the reserved chunk? */
+	if (pcpu_reserved_chunk) {
+		void *start = pcpu_reserved_chunk->vm->addr;
+
+		if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+			return pcpu_reserved_chunk;
+	}
+
+	/* nah... search the regular ones */
 	n = *pcpu_chunk_rb_search(addr, &parent);
 	if (!n) {
 		/* no exactly matching chunk, the parent is the closest */
@@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 }
 
 /**
- * __alloc_percpu - allocate percpu area
+ * pcpu_alloc - the percpu allocator
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
+ * @reserved: allocate from the reserved chunk if available
  *
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
@@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-void *__alloc_percpu(size_t size, size_t align)
+static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
 	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
@@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align)
 
 	mutex_lock(&pcpu_mutex);
 
-	/* allocate area */
+	/* serve reserved allocations from the reserved chunk if available */
+	if (reserved && pcpu_reserved_chunk) {
+		chunk = pcpu_reserved_chunk;
+		if (size > chunk->contig_hint)
+			goto out_unlock;
+		off = pcpu_alloc_area(chunk, size, align);
+		if (off >= 0)
+			goto area_found;
+		goto out_unlock;
+	}
+
+	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
@@ -773,8 +800,41 @@ out_unlock:
 	mutex_unlock(&pcpu_mutex);
 	return ptr;
 }
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, false);
+}
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 
+/**
+ * __alloc_reserved_percpu - allocate reserved percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align from reserved
+ * percpu area if arch has set it up; otherwise, allocation is served
+ * from the same dynamic area.  Might sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_reserved_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, true);
+}
+
 static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
 {
 	WARN_ON(chunk->immutable);
@@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  * @base_addr: mapped address, NULL for auto
@@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * indicates end of pages for the cpu.  Note that @get_page_fn() must
  * return the same number of pages for all cpus.
  *
+ * @reserved_size, if non-zero, specifies the amount of bytes to
+ * reserve after the static area in the first chunk.  This reserves
+ * the first chunk such that it's available only through reserved
+ * percpu allocation.  This is primarily used to serve module percpu
+ * static areas on architectures where the addressing model has
+ * limited offset range for symbol relocations to guarantee module
+ * percpu symbols fall inside the relocatable range.
+ *
  * @unit_size, if non-negative, specifies unit size and must be
  * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
  *
  * @dyn_size, if non-negative, limits the number of bytes available
  * for dynamic allocation in the first chunk.  Specifying non-negative
  * value make percpu leave alone the area beyond @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
  *
  * Non-null @base_addr means that the caller already allocated virtual
  * region for the first chunk and mapped it.  percpu must not mess
@@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * @populate_pte_fn is used to populate the pagetable.  NULL means the
  * caller already populated the pagetable.
  *
+ * If the first chunk ends up with both reserved and dynamic areas, it
+ * is served by two chunks - one to serve the core static and reserved
+ * areas and the other for the dynamic area.  They share the same vm
+ * and page map but uses different area allocation map to stay away
+ * from each other.  The latter chunk is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunks.
+ *
  * RETURNS:
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-				     size_t static_size,
+				     size_t static_size, size_t reserved_size,
 				     ssize_t unit_size, ssize_t dyn_size,
 				     void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
-	static int smap[2];
-	struct pcpu_chunk *schunk;
+	static int smap[2], dmap[2];
+	struct pcpu_chunk *schunk, *dchunk = NULL;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 
 	/* santiy checks */
-	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
+	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
+		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
 	if (unit_size >= 0) {
-		BUG_ON(unit_size < static_size +
+		BUG_ON(unit_size < static_size + reserved_size +
 				   (dyn_size >= 0 ? dyn_size : 0));
 		BUG_ON(unit_size & ~PAGE_MASK);
 	} else {
@@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	else
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
-					PFN_UP(static_size));
+					PFN_UP(static_size + reserved_size));
 
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
@@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 
 	if (dyn_size < 0)
-		dyn_size = pcpu_unit_size - static_size;
+		dyn_size = pcpu_unit_size - static_size - reserved_size;
 
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
@@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 
-	/* init static chunk */
+	/*
+	 * Initialize static chunk.  If reserved_size is zero, the
+	 * static chunk covers static area + dynamic allocation area
+	 * in the first chunk.  If reserved_size is not zero, it
+	 * covers static area + reserved area (mostly used for module
+	 * static percpu allocation).
+	 */
 	schunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&schunk->list);
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->page = schunk->page_ar;
-	schunk->free_size = dyn_size;
+
+	if (reserved_size) {
+		schunk->free_size = reserved_size;
+		pcpu_reserved_chunk = schunk;	/* not for dynamic alloc */
+	} else {
+		schunk->free_size = dyn_size;
+		dyn_size = 0;			/* dynamic area covered */
+	}
 	schunk->contig_hint = schunk->free_size;
 
 	schunk->map[schunk->map_used++] = -static_size;
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 
+	pcpu_reserved_chunk_limit = static_size + schunk->free_size;
+
+	/* init dynamic chunk if necessary */
+	if (dyn_size) {
+		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+		INIT_LIST_HEAD(&dchunk->list);
+		dchunk->vm = &first_vm;
+		dchunk->map = dmap;
+		dchunk->map_alloc = ARRAY_SIZE(dmap);
+		dchunk->page = schunk->page_ar;	/* share page map with schunk */
+
+		dchunk->contig_hint = dchunk->free_size = dyn_size;
+		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
+		dchunk->map[dchunk->map_used++] = dchunk->free_size;
+	}
+
 	/* allocate vm address */
 	first_vm.flags = VM_ALLOC;
 	first_vm.size = pcpu_chunk_size;
@@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	else {
 		/*
 		 * Pages already mapped.  No need to remap into
-		 * vmalloc area.  In this case the static chunk can't
-		 * be mapped or unmapped by percpu and is marked
+		 * vmalloc area.  In this case the first chunks can't
+		 * be mapped or unmapped by percpu and are marked
 		 * immutable.
 		 */
 		first_vm.addr = base_addr;
 		schunk->immutable = true;
+		if (dchunk)
+			dchunk->immutable = true;
 	}
 
 	/* assign pages */
@@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	}
 
 	/* link the first chunk in */
-	pcpu_chunk_relocate(schunk, -1);
-	pcpu_chunk_addr_insert(schunk);
+	if (!dchunk) {
+		pcpu_chunk_relocate(schunk, -1);
+		pcpu_chunk_addr_insert(schunk);
+	} else {
+		pcpu_chunk_relocate(dchunk, -1);
+		pcpu_chunk_addr_insert(dchunk);
+	}
 
 	/* we're done */
 	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
-- 
cgit v0.10.2


From 6b19b0c2400437a3c10059ede0e59b517092e1bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: x86, percpu: setup reserved percpu area for x86_64

Impact: fix relocation overflow during module load

x86_64 uses 32bit relocations for symbol access and static percpu
symbols whether in core or modules must be inside 2GB of the percpu
segement base which the dynamic percpu allocator doesn't guarantee.
This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the
first chunk so that module percpu areas are always allocated from the
first chunk which is always inside the relocatable range.

This problem exists for any percpu allocator but is easily triggered
when using the embedding allocator because the second chunk is located
beyond 2GB on it.

This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such
that it only indicates the size of the area to reserve for dynamic
allocation as static and dynamic areas can be separate.  New
PERCPU_DYNAMIC_RESERVED is increased by 4k for both 32 and 64bits as
the reserved area separation eats away some allocatable space and
having slightly more headroom (currently between 4 and 8k after
minimal boot sans module area) makes sense for common case
performance.

x86_32 can address anywhere from anywhere and doesn't need reserving.

Mike Galbraith first reported the problem first and bisected it to the
embedding percpu allocator commit.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index dd4eabc..efa615f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations.  Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base.  On x86_32, anything can
+ * address anywhere.  No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE	0
+#endif
+
 /**
  * pcpu_need_numa - determine percpu allocation needs to consider NUMA
  *
@@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
 	static struct vm_struct vm;
 	pg_data_t *last;
-	size_t ptrs_size;
+	size_t ptrs_size, dyn_size;
 	unsigned int cpu;
 	ssize_t ret;
 
@@ -169,12 +182,14 @@ proceed:
 	 * Currently supports only single page.  Supporting multiple
 	 * pages won't be too difficult if it ever becomes necessary.
 	 */
-	pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
 	if (pcpur_size > PMD_SIZE) {
 		pr_warning("PERCPU: static data is larger than large page, "
 			   "can't use large page\n");
 		return -EINVAL;
 	}
+	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
 	/* allocate pointer array and alloc large pages */
 	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
@@ -217,8 +232,9 @@ proceed:
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", vm.addr, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE,
-				     pcpur_size - static_size, vm.addr, NULL);
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE,
+				     PMD_SIZE, dyn_size, vm.addr, NULL);
 	goto out_free_ar;
 
 enomem:
@@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 		return -EINVAL;
 
 	/* allocate and copy */
-	pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
 	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-	dyn_size = pcpue_size - static_size;
+	dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
 	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
 				       PAGE_SIZE);
@@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
-	return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0,
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+				      PERCPU_FIRST_CHUNK_RESERVE,
 				      pcpue_unit_size, dyn_size,
 				      pcpue_ptr, NULL);
 }
@@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1,
-				     NULL, pcpu4k_populate_pte);
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
+				     pcpu4k_populate_pte);
 	goto out_free_ar;
 
 enomem:
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8ff1515..54a968b 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -85,31 +85,20 @@
 
 /*
  * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
- * back on the first chunk if arch is manually allocating and mapping
- * it for faster access (as a part of large page mapping for example).
- * Note that dynamic percpu allocator covers both static and dynamic
- * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
+ * back on the first chunk for dynamic percpu allocation if arch is
+ * manually allocating and mapping it for faster access (as a part of
+ * large page mapping for example).
  *
- * On typical configuration with modules, the following values leave
- * about 8k of free space on the first chunk after boot on both x86_32
- * and 64 when module support is enabled.  When module support is
- * disabled, it's much tighter.
+ * The following values give between one and two pages of free space
+ * after typical minimal boot (2-way SMP, single disk and NIC) with
+ * both defconfig and a distro config on x86_64 and 32.  More
+ * intelligent way to determine this would be nice.
  */
-#ifndef PERCPU_DYNAMIC_RESERVE
-#  if BITS_PER_LONG > 32
-#    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(24 << 10)
-#    else
-#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
-#    endif
-#  else
-#    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
-#    else
-#      define PERCPU_DYNAMIC_RESERVE	(8 << 10)
-#    endif
-#  endif
-#endif	/* PERCPU_DYNAMIC_RESERVE */
+#if BITS_PER_LONG > 32
+#define PERCPU_DYNAMIC_RESERVE		(20 << 10)
+#else
+#define PERCPU_DYNAMIC_RESERVE		(12 << 10)
+#endif
 
 extern void *pcpu_base_addr;
 
-- 
cgit v0.10.2


From 59247eaea50cc68cc6ce3d3fd3855f3301b65c96 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 6 Mar 2009 08:55:24 +0100
Subject: block: fix missing bio back/front segment size setting in
 blk_recount_segments()

Commit 1e42807918d17e8c93bf14fbb74be84b141334c1 introduced a bug where we
don't get front/back segment sizes in the bio in blk_recount_segments().
Fix this by tracking the back bio as well as the front bio in
__blk_recalc_rq_segments(), this also cleans up the interface by getting
rid of the segment size pointer passing.

Tested-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-merge.c b/block/blk-merge.c
index a104593..5a244f0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -39,14 +39,13 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
 }
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
-					     struct bio *bio,
-					     unsigned int *seg_size_ptr)
+					     struct bio *bio)
 {
 	unsigned int phys_size;
 	struct bio_vec *bv, *bvprv = NULL;
 	int cluster, i, high, highprv = 1;
 	unsigned int seg_size, nr_phys_segs;
-	struct bio *fbio;
+	struct bio *fbio, *bbio;
 
 	if (!bio)
 		return 0;
@@ -87,26 +86,20 @@ new_segment:
 			seg_size = bv->bv_len;
 			highprv = high;
 		}
+		bbio = bio;
 	}
 
-	if (seg_size_ptr)
-		*seg_size_ptr = seg_size;
+	if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
+		fbio->bi_seg_front_size = seg_size;
+	if (seg_size > bbio->bi_seg_back_size)
+		bbio->bi_seg_back_size = seg_size;
 
 	return nr_phys_segs;
 }
 
 void blk_recalc_rq_segments(struct request *rq)
 {
-	unsigned int seg_size = 0, phys_segs;
-
-	phys_segs = __blk_recalc_rq_segments(rq->q, rq->bio, &seg_size);
-
-	if (phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
-		rq->bio->bi_seg_front_size = seg_size;
-	if (seg_size > rq->biotail->bi_seg_back_size)
-		rq->biotail->bi_seg_back_size = seg_size;
-
-	rq->nr_phys_segments = phys_segs;
+	rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
@@ -114,7 +107,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
 	struct bio *nxt = bio->bi_next;
 
 	bio->bi_next = NULL;
-	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, NULL);
+	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
 	bio->bi_next = nxt;
 	bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
-- 
cgit v0.10.2


From 14b97595e0e1f47b6f809e180e5bcd8dcd995690 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 6 Mar 2009 09:42:07 +0100
Subject: ALSA: hda - Fix typos in slave controls in patch_sigmatel.c

"Headphone Playback ..." appears twice in slave_vols[] and slave_sws[].
They should be "Headphone Playback2 ..."

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
index 3bc4276..995b413 100644
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -1207,7 +1207,7 @@ static const char *slave_vols[] = {
 	"LFE Playback Volume",
 	"Side Playback Volume",
 	"Headphone Playback Volume",
-	"Headphone Playback Volume",
+	"Headphone2 Playback Volume",
 	"Speaker Playback Volume",
 	"External Speaker Playback Volume",
 	"Speaker2 Playback Volume",
@@ -1221,7 +1221,7 @@ static const char *slave_sws[] = {
 	"LFE Playback Switch",
 	"Side Playback Switch",
 	"Headphone Playback Switch",
-	"Headphone Playback Switch",
+	"Headphone2 Playback Switch",
 	"Speaker Playback Switch",
 	"External Speaker Playback Switch",
 	"Speaker2 Playback Switch",
-- 
cgit v0.10.2


From c50ff7c04225c945b13d410d50fde6ff6c59d7ee Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 6 Mar 2009 09:43:58 +0100
Subject: ALSA: hda - Fix headphone-detect regression with multiple HP jacks

The recent changes over the DAC detection mechanism in patch_sigmatel.c
breaks the HP detection on the machines with multiple HP jacks.
It's basically because of the workaround to support the multi-channel
output.  Since the HP detection is more important feature, disable
the HP-swap workaroud temporarily.

Reference: Novell bnc#482052
	https://bugzilla.novell.com/show_bug.cgi?id=482052

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
index 995b413..6094344 100644
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -3516,6 +3516,7 @@ static int stac92xx_parse_auto_config(struct hda_codec *codec, hda_nid_t dig_out
 	if (! spec->autocfg.line_outs)
 		return 0; /* can't find valid pin config */
 
+#if 0 /* FIXME: temporarily disabled */
 	/* If we have no real line-out pin and multiple hp-outs, HPs should
 	 * be set up as multi-channel outputs.
 	 */
@@ -3535,6 +3536,7 @@ static int stac92xx_parse_auto_config(struct hda_codec *codec, hda_nid_t dig_out
 		spec->autocfg.line_out_type = AUTO_PIN_HP_OUT;
 		spec->autocfg.hp_outs = 0;
 	}
+#endif /* FIXME: temporarily disabled */
 	if (spec->autocfg.mono_out_pin) {
 		int dir = get_wcaps(codec, spec->autocfg.mono_out_pin) &
 			(AC_WCAP_OUT_AMP | AC_WCAP_IN_AMP);
-- 
cgit v0.10.2


From d1a8e7792047f7dca7eb5759250e2c12800bf262 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 6 Mar 2009 03:12:50 -0800
Subject: x86: make "memtest" like "memtest=17"

Impact: make boot command line "memtest" do one loop by default

So don't need to guess many patterns in one loop.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B10532.3020105@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd788..605c8be 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
 {
 	if (arg)
 		memtest_pattern = simple_strtoul(arg, NULL, 0);
+	else
+		memtest_pattern = ARRAY_SIZE(patterns);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From c77a3b59c624454c501cbfa1a3611d5a00bf9532 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 17:04:26 +0200
Subject: x86: fix uninitialized variable in init_memory_mapping()

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1236265466.31324.9.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6d63e3d..15219e0 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -134,8 +134,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 {
 	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
+	unsigned long ret = 0;
 	unsigned long pos;
-	unsigned long ret;
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
-- 
cgit v0.10.2


From 5dd61dfabcaa5bfb67afb8a2d255bd1e156562e3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 17:04:57 +0200
Subject: x86: rename do_not_nx to disable_nx in mm/init_64.c

As a preparational step for unifying noexec handling on 32-bit and 64-bit,
rename the do_not_nx variable to disable_nx on 64-bit.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1236265497.31324.11.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 8a853bc..54efa57d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -85,7 +85,7 @@ early_param("gbpages", parse_direct_gbpages_on);
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
 
 /*
  * noexec=on|off
@@ -100,9 +100,9 @@ static int __init nonx_setup(char *str)
 		return -EINVAL;
 	if (!strncmp(str, "on", 2)) {
 		__supported_pte_mask |= _PAGE_NX;
-		do_not_nx = 0;
+		disable_nx = 0;
 	} else if (!strncmp(str, "off", 3)) {
-		do_not_nx = 1;
+		disable_nx = 1;
 		__supported_pte_mask &= ~_PAGE_NX;
 	}
 	return 0;
@@ -114,7 +114,7 @@ void __cpuinit check_efer(void)
 	unsigned long efer;
 
 	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || do_not_nx)
+	if (!(efer & EFER_NX) || disable_nx)
 		__supported_pte_mask &= ~_PAGE_NX;
 }
 
-- 
cgit v0.10.2


From 9ca0791dcaa666e8e8f4b4ca028b65b4bde9cb28 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Thu, 5 Mar 2009 08:49:54 +0100
Subject: x86, bts: remove bad warning

In case a ptraced task is reaped (while the tracer is still attached),
ds_exit_thread() is called before ptrace_exit(). The latter will
release the bts_tracer and remove the thread's ds_ctx.
The former will WARN() if the context is not NULL.

Oleg Nesterov submitted patches that move ptrace_exit() before
exit_thread() and thus reverse the order of the above calls.

Remove the bad warning. I will add it again when Oleg's changes are in.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090305084954.A22000@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 169a120..de7cdbd 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
 
 void ds_exit_thread(struct task_struct *tsk)
 {
-	WARN_ON(tsk->thread.ds_ctx);
 }
-- 
cgit v0.10.2


From 73bf1b62f561fc8ecb00e2810efe4fe769f4933e Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Thu, 5 Mar 2009 08:57:21 +0100
Subject: x86, pebs: correct qualifier passed to ds_write_config() from
 ds_request_pebs()

ds_write_config() can write the BTS as well as the PEBS part of
the DS config. ds_request_pebs() passes the wrong qualifier, which
results in the wrong configuration to be written.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090305085721.A22550@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index de7cdbd..87b67e3 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
 	ds_resume_pebs(tracer);
 
 	return tracer;
-- 
cgit v0.10.2


From 1880d93b80acc3171850e9df5048bcb26b75c2f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:09 +0900
Subject: percpu: replace pcpu_realloc() with pcpu_mem_alloc() and
 pcpu_mem_free()

Impact: code reorganization for later changes

With static map handling moved to pcpu_split_block(), pcpu_realloc()
only clutters the code and it's also unsuitable for scheduled locking
changes.  Implement and use pcpu_mem_alloc/free() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index ef8e169..f1d0e90 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -164,39 +164,41 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 }
 
 /**
- * pcpu_realloc - versatile realloc
- * @p: the current pointer (can be NULL for new allocations)
- * @size: the current size in bytes (can be 0 for new allocations)
- * @new_size: the wanted new size in bytes (can be 0 for free)
+ * pcpu_mem_alloc - allocate memory
+ * @size: bytes to allocate
  *
- * More robust realloc which can be used to allocate, resize or free a
- * memory area of arbitrary size.  If the needed size goes over
- * PAGE_SIZE, kernel VM is used.
+ * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
+ * kzalloc() is used; otherwise, vmalloc() is used.  The returned
+ * memory is always zeroed.
  *
  * RETURNS:
- * The new pointer on success, NULL on failure.
+ * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+static void *pcpu_mem_alloc(size_t size)
 {
-	void *new;
-
-	if (new_size <= PAGE_SIZE)
-		new = kmalloc(new_size, GFP_KERNEL);
-	else
-		new = vmalloc(new_size);
-	if (new_size && !new)
-		return NULL;
-
-	memcpy(new, p, min(size, new_size));
-	if (new_size > size)
-		memset(new + size, 0, new_size - size);
+	if (size <= PAGE_SIZE)
+		return kzalloc(size, GFP_KERNEL);
+	else {
+		void *ptr = vmalloc(size);
+		if (ptr)
+			memset(ptr, 0, size);
+		return ptr;
+	}
+}
 
+/**
+ * pcpu_mem_free - free memory
+ * @ptr: memory to free
+ * @size: size of the area
+ *
+ * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
+ */
+static void pcpu_mem_free(void *ptr, size_t size)
+{
 	if (size <= PAGE_SIZE)
-		kfree(p);
+		kfree(ptr);
 	else
-		vfree(p);
-
-	return new;
+		vfree(ptr);
 }
 
 /**
@@ -331,29 +333,27 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 	if (chunk->map_alloc < target) {
 		int new_alloc;
 		int *new;
+		size_t size;
 
 		new_alloc = PCPU_DFL_MAP_ALLOC;
 		while (new_alloc < target)
 			new_alloc *= 2;
 
-		if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
-			/*
-			 * map_alloc smaller than the default size
-			 * indicates that the chunk is one of the
-			 * first chunks and still using static map.
-			 * Allocate a dynamic one and copy.
-			 */
-			new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
-			if (new)
-				memcpy(new, chunk->map,
-				       chunk->map_alloc * sizeof(new[0]));
-		} else
-			new = pcpu_realloc(chunk->map,
-					   chunk->map_alloc * sizeof(new[0]),
-					   new_alloc * sizeof(new[0]));
+		new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
 		if (!new)
 			return -ENOMEM;
 
+		size = chunk->map_alloc * sizeof(chunk->map[0]);
+		memcpy(new, chunk->map, size);
+
+		/*
+		 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the
+		 * chunk is one of the first chunks and still using
+		 * static map.
+		 */
+		if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
+			pcpu_mem_free(chunk->map, size);
+
 		chunk->map_alloc = new_alloc;
 		chunk->map = new;
 	}
@@ -696,7 +696,7 @@ static void free_pcpu_chunk(struct pcpu_chunk *chunk)
 		return;
 	if (chunk->vm)
 		free_vm_area(chunk->vm);
-	pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
+	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
 	kfree(chunk);
 }
 
@@ -708,8 +708,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 	if (!chunk)
 		return NULL;
 
-	chunk->map = pcpu_realloc(NULL, 0,
-				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
 	chunk->page = chunk->page_ar;
-- 
cgit v0.10.2


From 9f7dcf224bd09ec9ebcbfb383bf2c465e0e0b03d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:09 +0900
Subject: percpu: move chunk area map extension out of area allocation

Impact: code reorganization for later changes

Separate out chunk area map extension into a separate function -
pcpu_extend_area_map() - and call it directly from pcpu_alloc() such
that pcpu_alloc_area() is guaranteed to have enough area map slots on
invocation.

With this change, pcpu_alloc_area() does only area allocation and the
only failure mode is when the chunk doens't have enough room, so
there's no need to distinguish it from memory allocation failures.
Make it return -1 on such cases instead of hacky -ENOSPC.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index f1d0e90..7d9bc35 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -307,6 +307,50 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 }
 
 /**
+ * pcpu_extend_area_map - extend area map for allocation
+ * @chunk: target chunk
+ *
+ * Extend area map of @chunk so that it can accomodate an allocation.
+ * A single allocation can split an area into three areas, so this
+ * function makes sure that @chunk->map has at least two extra slots.
+ *
+ * RETURNS:
+ * 0 if noop, 1 if successfully extended, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+{
+	int new_alloc;
+	int *new;
+	size_t size;
+
+	/* has enough? */
+	if (chunk->map_alloc >= chunk->map_used + 2)
+		return 0;
+
+	new_alloc = PCPU_DFL_MAP_ALLOC;
+	while (new_alloc < chunk->map_used + 2)
+		new_alloc *= 2;
+
+	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
+	if (!new)
+		return -ENOMEM;
+
+	size = chunk->map_alloc * sizeof(chunk->map[0]);
+	memcpy(new, chunk->map, size);
+
+	/*
+	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
+	 * one of the first chunks and still using static map.
+	 */
+	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
+		pcpu_mem_free(chunk->map, size);
+
+	chunk->map_alloc = new_alloc;
+	chunk->map = new;
+	return 0;
+}
+
+/**
  * pcpu_split_block - split a map block
  * @chunk: chunk of interest
  * @i: index of map block to split
@@ -321,44 +365,16 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * depending on @head, is reduced by @tail bytes and @tail byte block
  * is inserted after the target block.
  *
- * RETURNS:
- * 0 on success, -errno on failure.
+ * @chunk->map must have enough free slots to accomodate the split.
  */
-static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
+			     int head, int tail)
 {
 	int nr_extra = !!head + !!tail;
-	int target = chunk->map_used + nr_extra;
-
-	/* reallocation required? */
-	if (chunk->map_alloc < target) {
-		int new_alloc;
-		int *new;
-		size_t size;
-
-		new_alloc = PCPU_DFL_MAP_ALLOC;
-		while (new_alloc < target)
-			new_alloc *= 2;
-
-		new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-		if (!new)
-			return -ENOMEM;
-
-		size = chunk->map_alloc * sizeof(chunk->map[0]);
-		memcpy(new, chunk->map, size);
-
-		/*
-		 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the
-		 * chunk is one of the first chunks and still using
-		 * static map.
-		 */
-		if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-			pcpu_mem_free(chunk->map, size);
 
-		chunk->map_alloc = new_alloc;
-		chunk->map = new;
-	}
+	BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
 
-	/* insert a new subblock */
+	/* insert new subblocks */
 	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 		sizeof(chunk->map[0]) * (chunk->map_used - i));
 	chunk->map_used += nr_extra;
@@ -371,7 +387,6 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 		chunk->map[i++] -= tail;
 		chunk->map[i] = tail;
 	}
-	return 0;
 }
 
 /**
@@ -384,8 +399,11 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
  * Note that this function only allocates the offset.  It doesn't
  * populate or map the area.
  *
+ * @chunk->map must have at least two free slots.
+ *
  * RETURNS:
- * Allocated offset in @chunk on success, -errno on failure.
+ * Allocated offset in @chunk on success, -1 if no matching area is
+ * found.
  */
 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 {
@@ -433,8 +451,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 
 		/* split if warranted */
 		if (head || tail) {
-			if (pcpu_split_block(chunk, i, head, tail))
-				return -ENOMEM;
+			pcpu_split_block(chunk, i, head, tail);
 			if (head) {
 				i++;
 				off += head;
@@ -461,14 +478,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 	chunk->contig_hint = max_contig;	/* fully scanned */
 	pcpu_chunk_relocate(chunk, oslot);
 
-	/*
-	 * Tell the upper layer that this chunk has no area left.
-	 * Note that this is not an error condition but a notification
-	 * to upper layer that it needs to look at other chunks.
-	 * -ENOSPC is chosen as it isn't used in memory subsystem and
-	 * matches the meaning in a way.
-	 */
-	return -ENOSPC;
+	/* tell the upper layer that this chunk has no matching area */
+	return -1;
 }
 
 /**
@@ -755,7 +766,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
-		if (size > chunk->contig_hint)
+		if (size > chunk->contig_hint ||
+		    pcpu_extend_area_map(chunk) < 0)
 			goto out_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
@@ -768,11 +780,11 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
+			if (pcpu_extend_area_map(chunk) < 0)
+				goto out_unlock;
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
-			if (off != -ENOSPC)
-				goto out_unlock;
 		}
 	}
 
-- 
cgit v0.10.2


From a56dbddf06b653ef9c04ca3767f260fd31ccebab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:11 +0900
Subject: percpu: move fully free chunk reclamation into a work

Impact: code reorganization for later changes

Do fully free chunk reclamation using a work.  This change is to
prepare for locking changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 7d9bc35..4c8a419 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -63,6 +63,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/workqueue.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -118,6 +119,10 @@ static DEFINE_MUTEX(pcpu_mutex);
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
+/* reclaim work to release fully free chunks, scheduled from free path */
+static void pcpu_reclaim(struct work_struct *work);
+static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+
 static int __pcpu_size_to_slot(int size)
 {
 	int highbit = fls(size);	/* size is in bytes */
@@ -846,13 +851,37 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
 	return pcpu_alloc(size, align, true);
 }
 
-static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+/**
+ * pcpu_reclaim - reclaim fully free chunks, workqueue function
+ * @work: unused
+ *
+ * Reclaim all fully free chunks except for the first one.
+ */
+static void pcpu_reclaim(struct work_struct *work)
 {
-	WARN_ON(chunk->immutable);
-	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
-	list_del(&chunk->list);
-	rb_erase(&chunk->rb_node, &pcpu_addr_root);
-	free_pcpu_chunk(chunk);
+	LIST_HEAD(todo);
+	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
+	struct pcpu_chunk *chunk, *next;
+
+	mutex_lock(&pcpu_mutex);
+
+	list_for_each_entry_safe(chunk, next, head, list) {
+		WARN_ON(chunk->immutable);
+
+		/* spare the first one */
+		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
+			continue;
+
+		rb_erase(&chunk->rb_node, &pcpu_addr_root);
+		list_move(&chunk->list, &todo);
+	}
+
+	mutex_unlock(&pcpu_mutex);
+
+	list_for_each_entry_safe(chunk, next, &todo, list) {
+		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+		free_pcpu_chunk(chunk);
+	}
 }
 
 /**
@@ -877,14 +906,13 @@ void free_percpu(void *ptr)
 
 	pcpu_free_area(chunk, off);
 
-	/* the chunk became fully free, kill one if there are other free ones */
+	/* if there are more than one fully free chunks, wake up grim reaper */
 	if (chunk->free_size == pcpu_unit_size) {
 		struct pcpu_chunk *pos;
 
-		list_for_each_entry(pos,
-				    &pcpu_slot[pcpu_chunk_slot(chunk)], list)
+		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
-				pcpu_kill_chunk(pos);
+				schedule_work(&pcpu_reclaim_work);
 				break;
 			}
 	}
-- 
cgit v0.10.2


From 7ab152470e8416ef2a44c800fdc157e2192f2974 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 6 Mar 2009 19:08:34 +0300
Subject: x86: linkage.h - guard assembler specifics by __ASSEMBLY__

Stephen Rothwell reported:

|Today's linux-next build (x86_64 allmodconfig) produced this warning:
|
|In file included from drivers/char/epca.c:49:
|drivers/char/digiFep1.h:7:1: warning: "GLOBAL" redefined
|In file included from include/linux/linkage.h:5,
|                 from include/linux/kernel.h:11,
|                 from arch/x86/include/asm/system.h:10,
|                 from arch/x86/include/asm/processor.h:17,
|                 from include/linux/prefetch.h:14,
|                 from include/linux/list.h:6,
|                 from include/linux/module.h:9,
|                 from drivers/char/epca.c:29:
|arch/x86/include/asm/linkage.h:55:1: warning: this is the location of the previous definition
|
|Probably introduced by commit 95695547a7db44b88a7ee36cf5df188de267e99e
|("x86: asm linkage - introduce GLOBAL macro") from the x86 tree.

Any assembler specific snippets being placed in headers
are to be protected by __ASSEMBLY__. Fixed.

Also move __ALIGN definition under the same protection as well.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090306160833.GB7420@localhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a..a0d70b4 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -4,11 +4,6 @@
 #undef notrace
 #define notrace __attribute__((no_instrument_function))
 
-#ifdef CONFIG_X86_64
-#define __ALIGN .p2align 4,,15
-#define __ALIGN_STR ".p2align 4,,15"
-#endif
-
 #ifdef CONFIG_X86_32
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
 /*
@@ -50,16 +45,25 @@
 	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
 			      "g" (arg4), "g" (arg5), "g" (arg6))
 
-#endif
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
 
 #define GLOBAL(name)	\
 	.globl name;	\
 	name:
 
+#ifdef CONFIG_X86_64
+#define __ALIGN .p2align 4,,15
+#define __ALIGN_STR ".p2align 4,,15"
+#endif
+
 #ifdef CONFIG_X86_ALIGNMENT_16
 #define __ALIGN .align 16,0x90
 #define __ALIGN_STR ".align 16,0x90"
 #endif
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ASM_X86_LINKAGE_H */
 
-- 
cgit v0.10.2


From c63c58056e268b0d6bd6994b69030c144567990d Mon Sep 17 00:00:00 2001
From: Jeremy Higdon <jeremy@sgi.com>
Date: Wed, 4 Mar 2009 12:09:46 -0800
Subject: [IA64] fix PCI DMA flag propagation on SN (Altix) with PICs

We recently discovered a problem with passing of DMA attributes on SN
systems with the older PIC chips.

[akpm@linux-foundation.org: coding-style fixes]

Signed-off-by: Jeremy Higdon <jeremy@sgi.com>
Cc: <habeck@sgi.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>

diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index e626e50..060df4a 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -135,11 +135,10 @@ pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr,
 	if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS)
 		pci_addr = IS_PIC_SOFT(pcibus_info) ?
 				PHYS_TO_DMA(paddr) :
-		    		PHYS_TO_TIODMA(paddr) | dma_attributes;
+				PHYS_TO_TIODMA(paddr);
 	else
-		pci_addr = IS_PIC_SOFT(pcibus_info) ?
-				paddr :
-				paddr | dma_attributes;
+		pci_addr = paddr;
+	pci_addr |= dma_attributes;
 
 	/* Handle Bus mode */
 	if (IS_PCIX(pcibus_info))
-- 
cgit v0.10.2


From bd05f28e1a15ae62994fe309a524695fe26dd834 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 3 Mar 2009 22:55:21 +0100
Subject: cfg80211: test before subtraction on unsigned

freq_diff is unsigned, so test before subtraction

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 85c9034..bd0a16c 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -380,7 +380,8 @@ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
 
 	freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
 
-	if (freq_diff <= 0 || freq_range->max_bandwidth_khz > freq_diff)
+	if (freq_range->end_freq_khz <= freq_range->start_freq_khz ||
+			freq_range->max_bandwidth_khz > freq_diff)
 		return false;
 
 	return true;
-- 
cgit v0.10.2


From c0350024723b4a69e38655816484d934aca8eb30 Mon Sep 17 00:00:00 2001
From: Christian Lamparter <chunkeey@web.de>
Date: Fri, 6 Mar 2009 00:53:59 +0100
Subject: p54: fix race condition in memory management

This patch fixes a number of race conditions in the driver.
Up until now, "entry" pointer was initialized before acquiring the right lock.

Signed-off-by: Christian Lamparter <chunkeey@web.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/p54/p54common.c b/drivers/net/wireless/p54/p54common.c
index 34561e6..f170106 100644
--- a/drivers/net/wireless/p54/p54common.c
+++ b/drivers/net/wireless/p54/p54common.c
@@ -710,10 +710,11 @@ static struct sk_buff *p54_find_tx_entry(struct ieee80211_hw *dev,
 					   __le32 req_id)
 {
 	struct p54_common *priv = dev->priv;
-	struct sk_buff *entry = priv->tx_queue.next;
+	struct sk_buff *entry;
 	unsigned long flags;
 
 	spin_lock_irqsave(&priv->tx_queue.lock, flags);
+	entry = priv->tx_queue.next;
 	while (entry != (struct sk_buff *)&priv->tx_queue) {
 		struct p54_hdr *hdr = (struct p54_hdr *) entry->data;
 
@@ -732,7 +733,7 @@ static void p54_rx_frame_sent(struct ieee80211_hw *dev, struct sk_buff *skb)
 	struct p54_common *priv = dev->priv;
 	struct p54_hdr *hdr = (struct p54_hdr *) skb->data;
 	struct p54_frame_sent *payload = (struct p54_frame_sent *) hdr->data;
-	struct sk_buff *entry = (struct sk_buff *) priv->tx_queue.next;
+	struct sk_buff *entry;
 	u32 addr = le32_to_cpu(hdr->req_id) - priv->headroom;
 	struct memrecord *range = NULL;
 	u32 freed = 0;
@@ -741,6 +742,7 @@ static void p54_rx_frame_sent(struct ieee80211_hw *dev, struct sk_buff *skb)
 	int count, idx;
 
 	spin_lock_irqsave(&priv->tx_queue.lock, flags);
+	entry = (struct sk_buff *) priv->tx_queue.next;
 	while (entry != (struct sk_buff *)&priv->tx_queue) {
 		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(entry);
 		struct p54_hdr *entry_hdr;
@@ -976,7 +978,7 @@ static int p54_assign_address(struct ieee80211_hw *dev, struct sk_buff *skb,
 			       struct p54_hdr *data, u32 len)
 {
 	struct p54_common *priv = dev->priv;
-	struct sk_buff *entry = priv->tx_queue.next;
+	struct sk_buff *entry;
 	struct sk_buff *target_skb = NULL;
 	struct ieee80211_tx_info *info;
 	struct memrecord *range;
@@ -1014,6 +1016,7 @@ static int p54_assign_address(struct ieee80211_hw *dev, struct sk_buff *skb,
 		}
 	}
 
+	entry = priv->tx_queue.next;
 	while (left--) {
 		u32 hole_size;
 		info = IEEE80211_SKB_CB(entry);
-- 
cgit v0.10.2


From ccea34b5d0fbab081496d1860f31acee99fa8a6d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:13 +0900
Subject: percpu: finer grained locking to break deadlock and allow atomic free

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL and whole alloc/free paths
were protected by single mutex.  All percpu allocations have been from
GFP_KERNEL-safe context and the original allocator had this assumption
too.  However, by protecting both alloc and free paths with the same
mutex, the new allocator creates free -> alloc -> GFP_KERNEL
dependency which the original allocator didn't have.  This can lead to
deadlock if free is called from FS or IO paths.  Also, in general,
allocators are expected to allow free to be called from atomic
context.

This patch implements finer grained locking to break the deadlock and
allow atomic free.  For details, please read the "Synchronization
rules" comment.

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419..bfe6a3a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
- *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete.  As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
- *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock.  However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary.  All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context.  When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks.  Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used.  The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot.  Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
+	spin_unlock_irq(&pcpu_lock);
+
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
 	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new)
+	if (!new) {
+		spin_lock_irq(&pcpu_lock);
 		return -ENOMEM;
+	}
+
+	/*
+	 * Acquire pcpu_lock and switch to new area map.  Only free
+	 * could have happened inbetween, so map_used couldn't have
+	 * grown.
+	 */
+	spin_lock_irq(&pcpu_lock);
+	BUG_ON(new_alloc < chunk->map_used + 2);
 
 	size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk.  Note that this function
  * only modifies the allocation map.  It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align.  Might
- * sleep.  Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint ||
 		    pcpu_extend_area_map(chunk) < 0)
-			goto out_unlock;
+			goto fail_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
+restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
-			if (pcpu_extend_area_map(chunk) < 0)
-				goto out_unlock;
+
+			switch (pcpu_extend_area_map(chunk)) {
+			case 0:
+				break;
+			case 1:
+				goto restart;	/* pcpu_lock dropped, restart */
+			default:
+				goto fail_unlock;
+			}
+
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	/* hmmm... no space left, create a new chunk */
+	spin_unlock_irq(&pcpu_lock);
+
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
-		goto out_unlock;
+		goto fail_unlock_mutex;
+
+	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
-
-	off = pcpu_alloc_area(chunk, size, align);
-	if (off < 0)
-		goto out_unlock;
+	goto restart;
 
 area_found:
+	spin_unlock_irq(&pcpu_lock);
+
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
+		spin_lock_irq(&pcpu_lock);
 		pcpu_free_area(chunk, off);
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
-	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-	mutex_unlock(&pcpu_mutex);
-	return ptr;
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+	spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area.  Might sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
 		list_move(&chunk->list, &todo);
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irq(&pcpu_lock);
+	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr.  Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
 	void *addr = __pcpu_ptr_to_addr(ptr);
 	struct pcpu_chunk *chunk;
+	unsigned long flags;
 	int off;
 
 	if (!ptr)
 		return;
 
-	mutex_lock(&pcpu_mutex);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
 			}
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
-- 
cgit v0.10.2


From ab96ddec7213004b632d24dc2cdcd2df5f16f50b Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sat, 7 Mar 2009 13:39:22 -0800
Subject: Input: serio - fix protocol number for TouchIT213

Protocol 0x37 has been reserved for iNexio devices and Sahara
was supposed to get 0x38.

Reported-by: Claudio Nieder <private@claudio.ch>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>

diff --git a/include/linux/serio.h b/include/linux/serio.h
index 1bcb357..e0417e4 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -212,7 +212,7 @@ static inline void serio_unpin_driver(struct serio *serio)
 #define SERIO_FUJITSU	0x35
 #define SERIO_ZHENHUA	0x36
 #define SERIO_INEXIO	0x37
-#define SERIO_TOUCHIT213	0x37
+#define SERIO_TOUCHIT213	0x38
 #define SERIO_W8001	0x39
 
 #endif
-- 
cgit v0.10.2


From 3a450de1365d20afde406f0d9b2931a5e4a4fd6a Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Fri, 6 Mar 2009 17:30:56 -0600
Subject: x86: UV: remove uv_flush_tlb_others() WARN_ON

In uv_flush_tlb_others() (arch/x86/kernel/tlb_uv.c),
the "WARN_ON(!in_atomic())" fails if CONFIG_PREEMPT is not enabled.

And CONFIG_PREEMPT is not enabled by default in the distribution that
most UV owners will use.

We could #ifdef CONFIG_PREEMPT the warning, but that is not good form.
And there seems to be no suitable fix to in_atomic() when CONFIG_PREMPT
is not on.

As Ingo commented:

  > and we have no proper primitive to test for atomicity. (mainly
  > because we dont know about atomicity on a non-preempt kernel)

So we drop the WARN_ON.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f04549a..d038b9c 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	int locals = 0;
 	struct bau_desc *bau_desc;
 
-	WARN_ON(!in_atomic());
-
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
 	uv_cpu = uv_blade_processor_id();
-- 
cgit v0.10.2


From cda56ac29f2d8288d62978272856884d26e0b47b Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 10 Feb 2009 16:32:33 +0200
Subject: mmc: fix data timeout for SEND_EXT_CSD

Commit 0d3e0460f307e84904968aad6cff97bd688583d8
"MMC: CSD and CID timeout values" inadvertently broke
the timeout for the MMC command SEND_EXT_CSD.

This patch puts it back again.

Depending on the characteristics of the controller,
this bug may prevent the use of MMC cards.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>

diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c
index 9c50e6f..34ce270 100644
--- a/drivers/mmc/core/mmc_ops.c
+++ b/drivers/mmc/core/mmc_ops.c
@@ -248,12 +248,15 @@ mmc_send_cxd_data(struct mmc_card *card, struct mmc_host *host,
 
 	sg_init_one(&sg, data_buf, len);
 
-	/*
-	 * The spec states that CSR and CID accesses have a timeout
-	 * of 64 clock cycles.
-	 */
-	data.timeout_ns = 0;
-	data.timeout_clks = 64;
+	if (opcode == MMC_SEND_CSD || opcode == MMC_SEND_CID) {
+		/*
+		 * The spec states that CSR and CID accesses have a timeout
+		 * of 64 clock cycles.
+		 */
+		data.timeout_ns = 0;
+		data.timeout_clks = 64;
+	} else
+		mmc_set_data_timeout(&data, card);
 
 	mmc_wait_for_req(host, &mrq);
 
-- 
cgit v0.10.2


From 1f442d70c84aa798e243e721eba728a98434cd86 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 7 Mar 2009 23:46:26 -0800
Subject: x86: remove smp_apply_quirks()/smp_checks()

Impact: cleanup and code size reduction on 64-bit

This code is only applied to Intel Pentium and AMD K7 32-bit cpus.

Move those checks to intel_init()/amd_init() for 32-bit
so 64-bit will not build this code.

Also change to use cpu_index check to see if we need to emit warning.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B377D2.8030108@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5..f47df59 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 	}
 }
 
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Certain Athlons might work (for various values of 'work') in SMP
+	 * but they are not certified as MP capable.
+	 */
+	/* Athlon 660/661 is valid. */
+	if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+	    (c->x86_mask == 1)))
+		goto valid_k7;
+
+	/* Duron 670 is valid */
+	if ((c->x86_model == 7) && (c->x86_mask == 0))
+		goto valid_k7;
+
+	/*
+	 * Athlon 662, Duron 671, and Athlon >model 7 have capability
+	 * bit. It's worth noting that the A5 stepping (662) of some
+	 * Athlon XP's have the MP bit set.
+	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+	 * more.
+	 */
+	if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+	    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+	     (c->x86_model > 7))
+		if (cpu_has_mp)
+			goto valid_k7;
+
+	/* If we get here, not a certified SMP capable AMD system. */
+
+	/*
+	 * Don't taint if we are running SMP kernel on a single non-MP
+	 * approved Athlon
+	 */
+	WARN_ONCE(1, "WARNING: This combination of AMD"
+		"processors is not suitable for SMP.\n");
+	if (!test_taint(TAINT_UNSAFE_SMP))
+		add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+	;
+#endif
+}
+
 static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 	}
 
 	set_cpu_cap(c, X86_FEATURE_K7);
+
+	amd_k7_smp_check(c);
 }
 #endif
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559b..191117f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -13,6 +13,7 @@
 #include <asm/uaccess.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/topology.h>
@@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void)
 }
 #endif
 
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86 == 5 &&
+	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_model <= 3) {
+		/*
+		 * Remember we have B step Pentia with bugs
+		 */
+		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+				    "with B stepping processors.\n");
+	}
+#endif
+}
+
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
 	unsigned long lo, hi;
@@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_NUMAQ
 	numaq_tsc_disable();
 #endif
+
+	intel_smp_check(c);
 }
 #else
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f..ef7d101 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 atomic_t init_deasserted;
 
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
-
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
 
 /* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
 	cpumask_set_cpu(cpuid, cpu_callin_mask);
 }
 
-static int __cpuinitdata unsafe_smp;
-
 /*
  * Activate a secondary processor.
  */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
-	/*
-	 * Mask B, Pentium, but not Pentium MMX
-	 */
-	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    c->x86 == 5 &&
-	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
-	    c->x86_model <= 3)
-		/*
-		 * Remember we have B step Pentia with bugs
-		 */
-		smp_b_stepping = 1;
-
-	/*
-	 * Certain Athlons might work (for various values of 'work') in SMP
-	 * but they are not certified as MP capable.
-	 */
-	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
-		if (num_possible_cpus() == 1)
-			goto valid_k7;
-
-		/* Athlon 660/661 is valid. */
-		if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
-		    (c->x86_mask == 1)))
-			goto valid_k7;
-
-		/* Duron 670 is valid */
-		if ((c->x86_model == 7) && (c->x86_mask == 0))
-			goto valid_k7;
-
-		/*
-		 * Athlon 662, Duron 671, and Athlon >model 7 have capability
-		 * bit. It's worth noting that the A5 stepping (662) of some
-		 * Athlon XP's have the MP bit set.
-		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
-		 * more.
-		 */
-		if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
-		    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
-		     (c->x86_model > 7))
-			if (cpu_has_mp)
-				goto valid_k7;
-
-		/* If we get here, not a certified SMP capable AMD system. */
-		unsafe_smp = 1;
-	}
-
-valid_k7:
-	;
-}
-
-static void __cpuinit smp_checks(void)
-{
-	if (smp_b_stepping)
-		printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
-				    "with B stepping processors.\n");
-
-	/*
-	 * Don't taint if we are running SMP kernel on a single non-MP
-	 * approved Athlon
-	 */
-	if (unsafe_smp && num_online_cpus() > 1) {
-		printk(KERN_INFO "WARNING: This combination of AMD"
-			"processors is not suitable for SMP.\n");
-		add_taint(TAINT_UNSAFE_SMP);
-	}
-}
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	c->cpu_index = id;
 	if (id != 0)
 		identify_secondary_cpu(c);
-	smp_apply_quirks(c);
 }
 
 
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	pr_debug("Boot done.\n");
 
 	impress_friends();
-	smp_checks();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
-- 
cgit v0.10.2


From 8827247ffcc9e880cbe4705655065cf011265157 Mon Sep 17 00:00:00 2001
From: Wang Chen <wangchen@cn.fujitsu.com>
Date: Sat, 7 Mar 2009 13:34:19 +0800
Subject: x86: don't define __this_fixmap_does_not_exist()

Impact: improve out-of-range fixmap index debugging

Commit "1b42f51630c7eebce6fb780b480731eb81afd325"
defined the __this_fixmap_does_not_exist() function
with a WARN_ON(1) in it.

This causes the linker to not report an error when
__this_fixmap_does_not_exist() is called with a
non-constant parameter.

Ingo defined __this_fixmap_does_not_exist() because he
wanted to get virt addresses of fix memory of nest level
by non-constant index.

But we can fix this and still keep the link-time check:

We can get the four slot virt addresses on link time and
store them to array slot_virt[].

Then we can then refer the slot_virt with non-constant index,
in the ioremap-leak detection code.

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
LKML-Reference: <49B2075B.4070509@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 62773ab..96786ef 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -504,13 +504,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
 	return &bm_pte[pte_index(addr)];
 }
 
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
 void __init early_ioremap_init(void)
 {
 	pmd_t *pmd;
+	int i;
 
 	if (early_ioremap_debug)
 		printk(KERN_INFO "early_ioremap_init()\n");
 
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
 	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
 	memset(bm_pte, 0, sizeof(bm_pte));
 	pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -577,6 +583,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
 
 static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
 static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
 static int __init check_early_ioremap_leak(void)
 {
 	int count = 0;
@@ -598,7 +605,8 @@ static int __init check_early_ioremap_leak(void)
 }
 late_initcall(check_early_ioremap_leak);
 
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
 {
 	unsigned long offset, last_addr;
 	unsigned int nrpages;
@@ -664,9 +672,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
 		--nrpages;
 	}
 	if (early_ioremap_debug)
-		printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+		printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
 
-	prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
 	return prev_map[slot];
 }
 
@@ -734,8 +742,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
 	}
 	prev_map[slot] = NULL;
 }
-
-void __this_fixmap_does_not_exist(void)
-{
-	WARN_ON(1);
-}
-- 
cgit v0.10.2


From 4302e5d53b9166d45317e3ddf0a7a9dab3efd43b Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Thu, 5 Mar 2009 11:45:48 +0100
Subject: MIPS: compat: Implement is_compat_task.

This is a build fix required after "x86-64: seccomp: fix 32/64 syscall
hole" (commit 5b1017404aea6d2e552e991b3fd814d839e9cd67).  MIPS doesn't
have the issue that was fixed for x86-64 by that patch.

This also doesn't solve the N32 issue which is that N32 seccomp processes
will be treated as non-compat processes thus only have access to N64
syscalls.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h
index ac5d541..6c5b409 100644
--- a/arch/mips/include/asm/compat.h
+++ b/arch/mips/include/asm/compat.h
@@ -3,6 +3,8 @@
 /*
  * Architecture specific compatibility types
  */
+#include <linux/seccomp.h>
+#include <linux/thread_info.h>
 #include <linux/types.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -218,4 +220,9 @@ struct compat_shmid64_ds {
 	compat_ulong_t	__unused2;
 };
 
+static inline int is_compat_task(void)
+{
+	return test_thread_flag(TIF_32BIT);
+}
+
 #endif /* _ASM_COMPAT_H */
-- 
cgit v0.10.2


From e954ef20c29b7af07a8cb5452f14fb69e3d9d2b2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 5 Mar 2009 12:04:57 -0800
Subject: x86: fix warning about nodeid

Impact: cleanup

Ingo found there warning about nodeid with some configs.

try to use for_each_online_node for non numa too. in that case
nodeid will be 0.

also move out boundary checking from setup_node_bootmem(), so
non-numa config will not check it.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49B03069.80001@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2966c6b..db81e9a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -806,11 +806,6 @@ static unsigned long __init setup_node_bootmem(int nodeid,
 {
 	unsigned long bootmap_size;
 
-	if (start_pfn > max_low_pfn)
-		return bootmap;
-	if (end_pfn > max_low_pfn)
-		end_pfn = max_low_pfn;
-
 	/* don't touch min_low_pfn */
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 					 bootmap >> PAGE_SHIFT,
@@ -843,13 +838,23 @@ void __init setup_bootmem_allocator(void)
 		 max_pfn_mapped<<PAGE_SHIFT);
 	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
+	for_each_online_node(nodeid) {
+		 unsigned long start_pfn, end_pfn;
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-	for_each_online_node(nodeid)
-		bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
-					node_end_pfn[nodeid], bootmap);
+		start_pfn = node_start_pfn[nodeid];
+		end_pfn = node_end_pfn[nodeid];
+		if (start_pfn > max_low_pfn)
+			continue;
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
 #else
-	bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
+		start_pfn = 0;
+		end_pfn = max_low_pfn;
 #endif
+		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+						 bootmap);
+	}
 
 	after_bootmem = 1;
 }
-- 
cgit v0.10.2


From d0fc63f7bd07cb779a06dc1cdd0c5a14e7f5d562 Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Sun, 8 Mar 2009 20:21:35 +0200
Subject: x86 mmiotrace: fix remove_kmmio_fault_pages()

Impact: fix race+crash in mmiotrace

The list manipulation in remove_kmmio_fault_pages() was broken. If more
than one consecutive kmmio_fault_page was re-added during the grace
period between unregister_kmmio_probe() and remove_kmmio_fault_pages(),
the list manipulation failed to remove pages from the release list.

After a second grace period the pages get into rcu_free_kmmio_fault_pages()
and raise a BUG_ON() kernel crash.

The list manipulation is fixed to properly remove pages from the release
list.

This bug has been present from the very beginning of mmiotrace in the
mainline kernel. It was introduced in 0fd0e3da ("x86: mmiotrace full
patch, preview 1");

An urgent fix for Linus. Tested by Stuart (on 32-bit) and Pekka
(on amd and intel 64-bit systems, nouveau and nvidia proprietary).

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Signed-off-by: Pekka Paalanen <pq@iki.fi>
LKML-Reference: <20090308202135.34933feb@daedalus.pq.iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 9f20503..6a518dd 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -451,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
 
 static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
-	struct kmmio_delayed_release *dr = container_of(
-						head,
-						struct kmmio_delayed_release,
-						rcu);
+	struct kmmio_delayed_release *dr =
+		container_of(head, struct kmmio_delayed_release, rcu);
 	struct kmmio_fault_page *p = dr->release_list;
 	struct kmmio_fault_page **prevp = &dr->release_list;
 	unsigned long flags;
+
 	spin_lock_irqsave(&kmmio_lock, flags);
 	while (p) {
-		if (!p->count)
+		if (!p->count) {
 			list_del_rcu(&p->list);
-		else
+			prevp = &p->release_next;
+		} else {
 			*prevp = p->release_next;
-		prevp = &p->release_next;
+		}
 		p = p->release_next;
 	}
 	spin_unlock_irqrestore(&kmmio_lock, flags);
+
 	/* This is the real RCU destroy call. */
 	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
 }
-- 
cgit v0.10.2


From 0feca851c1b3cb4ebfa3149144b3d5de0879ebaa Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 6 Mar 2009 10:09:26 -0800
Subject: x86-32: make sure virt_addr_valid() returns false for fixmap
 addresses

I found that virt_addr_valid() was returning true for fixmap addresses.

I'm not sure whether pfn_valid() is supposed to include this test,
but there's no harm in being explicit.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B166D6.2080505@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 62773ab..62def57 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -87,6 +87,8 @@ bool __virt_addr_valid(unsigned long x)
 		return false;
 	if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
 		return false;
+	if (x >= FIXADDR_START)
+		return false;
 	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
 }
 EXPORT_SYMBOL(__virt_addr_valid);
-- 
cgit v0.10.2


From cbd88c8e6f5cdb8d4b9af01df825305200240382 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 9 Mar 2009 10:06:22 -0600
Subject: lguest: fix crash 'unhandled trap 13 at <native_read_msr_safe>'

Impact: fix lguest boot crash on modern Intel machines

The code in early_init_intel does:

	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
		u64 misc_enable;

		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);

And that rdmsr faults (not allowed from non-0 PL).  We can get around
this by mugging the family ID part of the cpuid.  5 seems like a good
number.

Of course, this is a hack (how very lguest!).  We could just indicate
that we don't support MSRs, or implement lguest_rdmst.

Reported-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Tested-by: Patrick McHardy <kaber@trash.net>

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 92f1c6f..ba5c05e 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -343,6 +343,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 		 * flush_tlb_user() for both user and kernel mappings unless
 		 * the Page Global Enable (PGE) feature bit is set. */
 		*dx |= 0x00002000;
+		/* We also lie, and say we're family id 5.  6 or greater
+		 * leads to a rdmsr in early_init_intel which we can't handle.
+		 * Family ID is returned as bits 8-12 in ax. */
+		*ax &= 0xFFFFF0FF;
+		*ax |= 0x00000500;
 		break;
 	case 0x80000000:
 		/* Futureproof this a little: if they ask how much extended
-- 
cgit v0.10.2


From 6db6a5f3ae2ca6b874b0fd97ae16fdc9b5cdd6cc Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 9 Mar 2009 10:06:28 -0600
Subject: lguest: fix for CONFIG_SPARSE_IRQ=y

Impact: remove lots of lguest boot WARN_ON() when CONFIG_SPARSE_IRQ=y

We now need to call irq_to_desc_alloc_cpu() before
set_irq_chip_and_handler_name(), but we can't do that from init_IRQ (no
kmalloc available).

So do it as we use interrupts instead.  Also means we only alloc for
irqs we use, which was the intent of CONFIG_SPARSE_IRQ anyway.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ba5c05e..960a8d9 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -594,19 +594,21 @@ static void __init lguest_init_IRQ(void)
 		/* Some systems map "vectors" to interrupts weirdly.  Lguest has
 		 * a straightforward 1 to 1 mapping, so force that here. */
 		__get_cpu_var(vector_irq)[vector] = i;
-		if (vector != SYSCALL_VECTOR) {
-			set_intr_gate(vector,
-				      interrupt[vector-FIRST_EXTERNAL_VECTOR]);
-			set_irq_chip_and_handler_name(i, &lguest_irq_controller,
-						      handle_level_irq,
-						      "level");
-		}
+		if (vector != SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
 	}
 	/* This call is required to set up for 4k stacks, where we have
 	 * separate stacks for hard and soft interrupts. */
 	irq_ctx_init(smp_processor_id());
 }
 
+void lguest_setup_irq(unsigned int irq)
+{
+	irq_to_desc_alloc_cpu(irq, 0);
+	set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+				      handle_level_irq, "level");
+}
+
 /*
  * Time.
  *
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index b4d44e5..8132533 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -212,6 +212,9 @@ static void lg_notify(struct virtqueue *vq)
 	hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0);
 }
 
+/* An extern declaration inside a C file is bad form.  Don't do it. */
+extern void lguest_setup_irq(unsigned int irq);
+
 /* This routine finds the first virtqueue described in the configuration of
  * this device and sets it up.
  *
@@ -266,6 +269,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 		goto unmap;
 	}
 
+	/* Make sure the interrupt is allocated. */
+	lguest_setup_irq(lvq->config.irq);
+
 	/* Tell the interrupt for this virtqueue to go to the virtio_ring
 	 * interrupt handler. */
 	/* FIXME: We used to have a flag for the Host to tell us we could use
-- 
cgit v0.10.2


From 6d5b5acca9e566515ef3f1ed617e7295c4f94345 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 9 Mar 2009 13:31:59 +0100
Subject: Fix fixpoint divide exception in acct_update_integrals

Frans Pop reported the crash below when running an s390 kernel under Hercules:

  Kernel BUG at 000738b4  verbose debug info unavailable!
  fixpoint divide exception: 0009  #1! SMP
  Modules linked in: nfs lockd nfs_acl sunrpc ctcm fsm tape_34xx
     cu3088 tape ccwgroup tape_class ext3 jbd mbcache dm_mirror dm_log dm_snapshot
     dm_mod dasd_eckd_mod dasd_mod
  CPU: 0 Not tainted 2.6.27.19 #13
  Process awk (pid: 2069, task: 0f9ed9b8, ksp: 0f4f7d18)
  Krnl PSW : 070c1000 800738b4 (acct_update_integrals+0x4c/0x118)
             R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0
  Krnl GPRS: 00000000 000007d0 7fffffff fffff830
             00000000 ffffffff 00000002 0f9ed9b8
             00000000 00008ca0 00000000 0f9ed9b8
             0f9edda4 8007386e 0f4f7ec8 0f4f7e98
  Krnl Code: 800738aa: a71807d0         lhi     %r1,2000
             800738ae: 8c200001         srdl    %r2,1
             800738b2: 1d21             dr      %r2,%r1
            >800738b4: 5810d10e         l       %r1,270(%r13)
             800738b8: 1823             lr      %r2,%r3
             800738ba: 4130f060         la      %r3,96(%r15)
             800738be: 0de1             basr    %r14,%r1
             800738c0: 5800f060         l       %r0,96(%r15)
  Call Trace:
  ( <000000000004fdea>! blocking_notifier_call_chain+0x1e/0x2c)
    <0000000000038502>! do_exit+0x106/0x7c0
    <0000000000038c36>! do_group_exit+0x7a/0xb4
    <0000000000038c8e>! SyS_exit_group+0x1e/0x30
    <0000000000021c28>! sysc_do_restart+0x12/0x16
    <0000000077e7e924>! 0x77e7e924

Reason for this is that cpu time accounting usually only happens from
interrupt context, but acct_update_integrals gets also called from
process context with interrupts enabled.

So in acct_update_integrals we may end up with the following scenario:

Between reading tsk->stime/tsk->utime and tsk->acct_timexpd an interrupt
happens which updates accouting values.  This causes acct_timexpd to be
greater than the former stime + utime.  The subsequent calculation of

	dtime = cputime_sub(time, tsk->acct_timexpd);

will be negative and the division performed by

	cputime_to_jiffies(dtime)

will generate an exception since the result won't fit into a 32 bit
register.

In order to fix this just always disable interrupts while accessing any
of the accounting values.

Reported by: Frans Pop <elendil@planet.nl>
Tested by: Frans Pop <elendil@planet.nl>
Cc: stable@kernel.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 43f891b..00d59d0 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk)
 	if (likely(tsk->mm)) {
 		cputime_t time, dtime;
 		struct timeval value;
+		unsigned long flags;
 		u64 delta;
 
+		local_irq_save(flags);
 		time = tsk->stime + tsk->utime;
 		dtime = cputime_sub(time, tsk->acct_timexpd);
 		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
@@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk)
 		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
-			return;
+			goto out;
 		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+	out:
+		local_irq_restore(flags);
 	}
 }
 
-- 
cgit v0.10.2


From b9447ef80bd301b932ac4d85c9622e929de5fd62 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 9 Mar 2009 11:45:38 -0400
Subject: Btrfs: fix spinlock assertions on UP systems

btrfs_tree_locked was being used to make sure a given extent_buffer was
properly locked in a few places.  But, it wasn't correct for UP compiled
kernels.

This switches it to using assert_spin_locked instead, and renames it to
btrfs_assert_tree_locked to better reflect how it was really being used.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 42491d7..37f31b5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -277,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 
 	if (parent)
 		parent_start = parent->start;
@@ -2365,7 +2365,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
 
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+	btrfs_assert_tree_locked(path->nodes[1]);
 
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
@@ -2562,7 +2562,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right_nritems == 0)
 		return 1;
 
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+	btrfs_assert_tree_locked(path->nodes[1]);
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
@@ -4101,7 +4101,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 		next = read_node_slot(root, c, slot);
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(c));
+			btrfs_assert_tree_locked(c);
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
 		}
@@ -4126,7 +4126,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+			btrfs_assert_tree_locked(path->nodes[level]);
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
 		}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index adda739..3e18175 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -857,7 +857,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
-		WARN_ON(!btrfs_tree_locked(buf));
+		btrfs_assert_tree_locked(buf);
 
 		/* ugh, clear_extent_buffer_dirty can be expensive */
 		btrfs_set_lock_blocking(buf);
@@ -2361,7 +2361,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 	btrfs_set_lock_blocking(buf);
 
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6b5966a..9abf81f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4418,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	BUG_ON(!btrfs_tree_locked(parent));
+	btrfs_assert_tree_locked(parent);
 	parent_level = btrfs_header_level(parent);
 	extent_buffer_get(parent);
 	path->nodes[parent_level] = parent;
 	path->slots[parent_level] = btrfs_header_nritems(parent);
 
-	BUG_ON(!btrfs_tree_locked(node));
+	btrfs_assert_tree_locked(node);
 	level = btrfs_header_level(node);
 	extent_buffer_get(node);
 	path->nodes[level] = node;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 85506c4..47b0a88 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -220,8 +220,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
 	return 0;
 }
 
-int btrfs_tree_locked(struct extent_buffer *eb)
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
 {
-	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
-			spin_is_locked(&eb->lock);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		assert_spin_locked(&eb->lock);
 }
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6bb0afb..6c4ce45 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,11 +21,11 @@
 
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_tree_locked(struct extent_buffer *eb);
 
 int btrfs_try_tree_lock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 
 void btrfs_set_lock_blocking(struct extent_buffer *eb);
 void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
 #endif
-- 
cgit v0.10.2


From 129f8ae9b1b5be94517da76009ea956e89104ce8 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Mon, 9 Mar 2009 15:07:33 -0400
Subject: Revert "[CPUFREQ] Disable sysfs ui for p4-clockmod."

This reverts commit e088e4c9cdb618675874becb91b2fd581ee707e6.

Removing the sysfs interface for p4-clockmod was flagged as a
regression in bug 12826.

Course of action:
 - Find out the remaining causes of overheating, and fix them
   if possible. ACPI should be doing the right thing automatically.
   If it isn't, we need to fix that.
 - mark p4-clockmod ui as deprecated
 - try again with the removal in six months.

It's not really feasible to printk about the deprecation, because
it needs to happen at all the sysfs entry points, which means adding
a lot of strcmp("p4-clockmod".. calls to the core, which.. bleuch.

Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04c..3178c3a 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = {
 	.name		= "p4-clockmod",
 	.owner		= THIS_MODULE,
 	.attr		= p4clockmod_attr,
-	.hide_interface	= 1,
 };
 
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b55cb67..d6daf3c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -754,11 +754,6 @@ static struct kobj_type ktype_cpufreq = {
 	.release	= cpufreq_sysfs_release,
 };
 
-static struct kobj_type ktype_empty_cpufreq = {
-	.sysfs_ops	= &sysfs_ops,
-	.release	= cpufreq_sysfs_release,
-};
-
 
 /**
  * cpufreq_add_dev - add a CPU device
@@ -892,36 +887,26 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 	memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
 
 	/* prepare interface data */
-	if (!cpufreq_driver->hide_interface) {
-		ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
-					   &sys_dev->kobj, "cpufreq");
+	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj,
+				   "cpufreq");
+	if (ret)
+		goto err_out_driver_exit;
+
+	/* set up files for this cpu device */
+	drv_attr = cpufreq_driver->attr;
+	while ((drv_attr) && (*drv_attr)) {
+		ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr));
 		if (ret)
 			goto err_out_driver_exit;
-
-		/* set up files for this cpu device */
-		drv_attr = cpufreq_driver->attr;
-		while ((drv_attr) && (*drv_attr)) {
-			ret = sysfs_create_file(&policy->kobj,
-						&((*drv_attr)->attr));
-			if (ret)
-				goto err_out_driver_exit;
-			drv_attr++;
-		}
-		if (cpufreq_driver->get) {
-			ret = sysfs_create_file(&policy->kobj,
-						&cpuinfo_cur_freq.attr);
-			if (ret)
-				goto err_out_driver_exit;
-		}
-		if (cpufreq_driver->target) {
-			ret = sysfs_create_file(&policy->kobj,
-						&scaling_cur_freq.attr);
-			if (ret)
-				goto err_out_driver_exit;
-		}
-	} else {
-		ret = kobject_init_and_add(&policy->kobj, &ktype_empty_cpufreq,
-					   &sys_dev->kobj, "cpufreq");
+		drv_attr++;
+	}
+	if (cpufreq_driver->get) {
+		ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr);
+		if (ret)
+			goto err_out_driver_exit;
+	}
+	if (cpufreq_driver->target) {
+		ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
 		if (ret)
 			goto err_out_driver_exit;
 	}
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 384b38d..1610427 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -234,7 +234,6 @@ struct cpufreq_driver {
 	int	(*suspend)	(struct cpufreq_policy *policy, pm_message_t pmsg);
 	int	(*resume)	(struct cpufreq_policy *policy);
 	struct freq_attr	**attr;
-	bool			hide_interface;
 };
 
 /* flags */
-- 
cgit v0.10.2


From 753b7aea8e4611433c13ac157f944d8b4bf42482 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Mon, 9 Mar 2009 15:14:37 -0400
Subject: [CPUFREQ] Add p4-clockmod sysfs-ui removal to feature-removal
 schedule.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 5ddbe35..20d3b94 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -335,3 +335,12 @@ Why:	In 2.6.18 the Secmark concept was introduced to replace the "compat_net"
 	Secmark, it is time to deprecate the older mechanism and start the
 	process of removing the old code.
 Who:	Paul Moore <paul.moore@hp.com>
+---------------------------
+
+What:	sysfs ui for changing p4-clockmod parameters
+When:	September 2009
+Why:	See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and
+	e088e4c9cdb618675874becb91b2fd581ee707e6.
+	Removal is subject to fixing any remaining bugs in ACPI which may
+	cause the thermal throttling not to happen at the right time.
+Who:	Dave Jones <davej@redhat.com>, Matthew Garrett <mjg@redhat.com>
-- 
cgit v0.10.2


From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 2 Mar 2009 22:58:45 +0100
Subject: copy_process: fix CLONE_PARENT && parent_exec_id interaction

CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we
re-use the old parent, we must also re-use ->parent_exec_id to make
sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed
task exits.

Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place
two different cases together.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/fork.c b/kernel/fork.c
index a66fbde..4854c2c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 	clear_all_latency_tracing(p);
 
-	/* Our parent execution domain becomes current domain
-	   These must match for thread signalling to apply */
-	p->parent_exec_id = p->self_exec_id;
-
 	/* ok, now we should be set up.. */
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
@@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		set_task_cpu(p, smp_processor_id());
 
 	/* CLONE_PARENT re-uses the old parent */
-	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
 		p->real_parent = current->real_parent;
-	else
+		p->parent_exec_id = current->parent_exec_id;
+	} else {
 		p->real_parent = current;
+		p->parent_exec_id = current->self_exec_id;
+	}
 
 	spin_lock(&current->sighand->siglock);
 
-- 
cgit v0.10.2


From ba087e6f69381de6c91d6634aa0f603a2fdc96a9 Mon Sep 17 00:00:00 2001
From: Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
Date: Fri, 6 Mar 2009 02:51:14 +0000
Subject: sh: Add media/soc_camera.h to board setup of Renesas AP325RXA

Other compilation errors were revised by commit of
"sh: ap325rxa: Revert ov772x support"
(08c2f5b4d76f83213e379b12df504269d21c9e7c) but other compilation
errors are given.
We revert this commit and need to add new header(media/soc_camera.h).
This change revises new compilation error.

Signed-off-by: Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/board-ap325rxa.c b/arch/sh/boards/board-ap325rxa.c
index 72da416..15b6d45 100644
--- a/arch/sh/boards/board-ap325rxa.c
+++ b/arch/sh/boards/board-ap325rxa.c
@@ -22,6 +22,7 @@
 #include <linux/gpio.h>
 #include <linux/spi/spi.h>
 #include <linux/spi/spi_gpio.h>
+#include <media/soc_camera.h>
 #include <media/soc_camera_platform.h>
 #include <media/sh_mobile_ceu.h>
 #include <video/sh_mobile_lcdc.h>
-- 
cgit v0.10.2


From 467fc4988986865b5dbcc8cc6a86c9b650cb0c6f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 10 Mar 2009 06:08:49 +0000
Subject: video: deferred io cleanup fix for sh_mobile_lcdcfb

Fix deferred io cleanup patch in the sh_mobile_lcdcfb driver.

If probe() fails early the sh_mobile_lcdc_stop() function will
be called to clean up deferred io. This patch modifies the
code to only call fb_deferred_io_cleanup() after deferred io
has been initialized.

With this patch applied we no longer hit BUG_ON() inside
fb_deferred_io_cleanup(). Triggers on a Migo-R with the
SYS QVGA panel board unmounted.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c
index 0e2b8fd..2c5d069 100644
--- a/drivers/video/sh_mobile_lcdcfb.c
+++ b/drivers/video/sh_mobile_lcdcfb.c
@@ -446,7 +446,6 @@ static void sh_mobile_lcdc_stop(struct sh_mobile_lcdc_priv *priv)
 {
 	struct sh_mobile_lcdc_chan *ch;
 	struct sh_mobile_lcdc_board_cfg	*board_cfg;
-	unsigned long tmp;
 	int k;
 
 	/* tell the board code to disable the panel */
@@ -456,9 +455,8 @@ static void sh_mobile_lcdc_stop(struct sh_mobile_lcdc_priv *priv)
 		if (board_cfg->display_off)
 			board_cfg->display_off(board_cfg->board_data);
 
-		/* cleanup deferred io if SYS bus */
-		tmp = ch->cfg.sys_bus_cfg.deferred_io_msec;
-		if (ch->ldmt1r_value & (1 << 12) && tmp) {
+		/* cleanup deferred io if enabled */
+		if (ch->info.fbdefio) {
 			fb_deferred_io_cleanup(&ch->info);
 			ch->info.fbdefio = NULL;
 		}
-- 
cgit v0.10.2


From 475049809977bf3975d78f2d2fd992e19ce2d59e Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 10 Mar 2009 12:55:45 -0700
Subject: mm: get_nid_for_pfn() returns int

get_nid_for_pfn() returns int

Presumably the (nid < 0) case has never happened.

We do know that it is happening on one system while creating a symlink for
a memory section so it should also happen on the same system if
unregister_mem_sect_under_nodes() were called to remove the same symlink.

The test was actually added in response to a problem with an earlier
version reported by Yasunori Goto where one or more of the leading pages
of a memory section on the 2nd node of one of his systems was
uninitialized because I believe they coincided with a memory hole.

That earlier version did not ignore uninitialized pages and determined
the nid by considering only the 1st page of each memory section.  This
caused the symlink to the 1st memory section on the 2nd node to be
incorrectly created in /sys/devices/system/node/node0 instead of
/sys/devices/system/node/node1.  The problem was fixed by adding the
test to skip over uninitialized pages.

I suspect we have not seen any reports of the non-removal
of a symlink due to the incorrect declaration of the nid
variable in unregister_mem_sect_under_nodes() because
  - systems where a memory section could have an uninitialized
    range of leading pages are probably rare.
  - memory remove is probably not done very frequently on the
    systems that are capable of demonstrating the problem.
  - lingering symlink(s) that should have been removed may
    have simply gone unnoticed.

[garyhade@us.ibm.com: wrote changelog]
Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Gary Hade <garyhade@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 43fa90b..f8f578a 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -303,7 +303,7 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk)
 	sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index);
 	sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
 	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
-		unsigned int nid;
+		int nid;
 
 		nid = get_nid_for_pfn(pfn);
 		if (nid < 0)
-- 
cgit v0.10.2


From b943c460ff8556a193b28e2145b513f8b978e869 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 10 Mar 2009 12:55:46 -0700
Subject: menu: fix embedded menu snafu

The COMPAT_BRK kconfig symbol does not depend on EMBEDDED, but it is in
the midst of the EMBEDDED menu symbols, so it mucks up the EMBEDDED menu.
Fix by moving it to just after all of the EMBEDDED menu symbols.  Also,
ANON_INODES has a similar problem, so move it to just above the EMBEDDED
menu items since it is used in the EMBEDDED menu.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/init/Kconfig b/init/Kconfig
index f068071..6a5c5fe 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -675,6 +675,9 @@ config CC_OPTIMIZE_FOR_SIZE
 config SYSCTL
 	bool
 
+config ANON_INODES
+	bool
+
 menuconfig EMBEDDED
 	bool "Configure standard kernel features (for small systems)"
 	help
@@ -780,18 +783,6 @@ config PCSPKR_PLATFORM
           This option allows to disable the internal PC-Speaker
           support, saving some memory.
 
-config COMPAT_BRK
-	bool "Disable heap randomization"
-	default y
-	help
-	  Randomizing heap placement makes heap exploits harder, but it
-	  also breaks ancient binaries (including anything libc5 based).
-	  This option changes the bootup default to heap randomization
-	  disabled, and can be overriden runtime by setting
-	  /proc/sys/kernel/randomize_va_space to 2.
-
-	  On non-ancient distros (post-2000 ones) N is usually a safe choice.
-
 config BASE_FULL
 	default y
 	bool "Enable full-sized data structures for core" if EMBEDDED
@@ -809,9 +800,6 @@ config FUTEX
 	  support for "fast userspace mutexes".  The resulting kernel may not
 	  run glibc-based applications correctly.
 
-config ANON_INODES
-	bool
-
 config EPOLL
 	bool "Enable eventpoll support" if EMBEDDED
 	default y
@@ -897,6 +885,18 @@ config SLUB_DEBUG
 	  SLUB sysfs support. /sys/slab will not exist and there will be
 	  no support for cache validation etc.
 
+config COMPAT_BRK
+	bool "Disable heap randomization"
+	default y
+	help
+	  Randomizing heap placement makes heap exploits harder, but it
+	  also breaks ancient binaries (including anything libc5 based).
+	  This option changes the bootup default to heap randomization
+	  disabled, and can be overriden runtime by setting
+	  /proc/sys/kernel/randomize_va_space to 2.
+
+	  On non-ancient distros (post-2000 ones) N is usually a safe choice.
+
 choice
 	prompt "Choose SLAB allocator"
 	default SLUB
-- 
cgit v0.10.2


From c15ade65788b70797c947f7de3e049e6a23f407f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Tue, 10 Mar 2009 12:55:47 -0700
Subject: lm85: fix the version check that broke adt7468 probing

The verstep check in the lm85 driver fails because the upper nibble of
the version register is 0x7, not 0x6, on the adt7468 chip.  Probing of
all adt7468s was broken by 69fc1feba2d5856ff74dedb6ae9d8c490210825c
("hwmon: (lm85) Rework the device detection"), and this patch fixes
that.  Also add in a missing i2c_device_id that accidentally got dropped
from the original patch.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Cc: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/hwmon/lm85.c b/drivers/hwmon/lm85.c
index cfc1ee9..cf1422e 100644
--- a/drivers/hwmon/lm85.c
+++ b/drivers/hwmon/lm85.c
@@ -72,6 +72,7 @@ I2C_CLIENT_INSMOD_7(lm85b, lm85c, adm1027, adt7463, adt7468, emc6d100,
 #define	LM85_COMPANY_SMSC		0x5c
 #define	LM85_VERSTEP_VMASK              0xf0
 #define	LM85_VERSTEP_GENERIC		0x60
+#define	LM85_VERSTEP_GENERIC2		0x70
 #define	LM85_VERSTEP_LM85C		0x60
 #define	LM85_VERSTEP_LM85B		0x62
 #define	LM85_VERSTEP_ADM1027		0x60
@@ -334,6 +335,7 @@ static struct lm85_data *lm85_update_device(struct device *dev);
 static const struct i2c_device_id lm85_id[] = {
 	{ "adm1027", adm1027 },
 	{ "adt7463", adt7463 },
+	{ "adt7468", adt7468 },
 	{ "lm85", any_chip },
 	{ "lm85b", lm85b },
 	{ "lm85c", lm85c },
@@ -1153,7 +1155,8 @@ static int lm85_detect(struct i2c_client *client, int kind,
 			address, company, verstep);
 
 		/* All supported chips have the version in common */
-		if ((verstep & LM85_VERSTEP_VMASK) != LM85_VERSTEP_GENERIC) {
+		if ((verstep & LM85_VERSTEP_VMASK) != LM85_VERSTEP_GENERIC &&
+		    (verstep & LM85_VERSTEP_VMASK) != LM85_VERSTEP_GENERIC2) {
 			dev_dbg(&adapter->dev, "Autodetection failed: "
 				"unsupported version\n");
 			return -ENODEV;
-- 
cgit v0.10.2


From 8ef1f0291a5d126f678b2f0225843c1ab550559c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Tue, 10 Mar 2009 12:55:48 -0700
Subject: lm85: add VRM10 support for adt7468 chip

The adt7468 chip supports VRM10 sensors just like the adt7463; add a
missing check for it.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Cc: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/hwmon/lm85.c b/drivers/hwmon/lm85.c
index cf1422e..b251d86 100644
--- a/drivers/hwmon/lm85.c
+++ b/drivers/hwmon/lm85.c
@@ -410,7 +410,8 @@ static ssize_t show_vid_reg(struct device *dev, struct device_attribute *attr,
 	struct lm85_data *data = lm85_update_device(dev);
 	int vid;
 
-	if (data->type == adt7463 && (data->vid & 0x80)) {
+	if ((data->type == adt7463 || data->type == adt7468) &&
+	    (data->vid & 0x80)) {
 		/* 6-pin VID (VRM 10) */
 		vid = vid_from_reg(data->vid & 0x3f, data->vrm);
 	} else {
-- 
cgit v0.10.2


From 5a891ed5adef39aca0b7662c58a2566c7a16237e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 10 Mar 2009 12:55:49 -0700
Subject: xtensa: fix compilation somewhat

* ->put_char changes
 * HIGHMEM is bogus it seems, there is no kmap_atomic() et al
 * some includes

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Chris Zankel <zankel@tensilica.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 6c873dc..9812008 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -103,9 +103,6 @@ config MATH_EMULATION
 	help
 	Can we use information of configuration file?
 
-config HIGHMEM
-	bool "High memory support"
-
 endmenu
 
 menu "Platform options"
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index 9606d2b..4ec1633 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -44,6 +44,8 @@
 #include <asm/setup.h>
 #include <asm/param.h>
 
+#include <platform/hardware.h>
+
 #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE)
 struct screen_info screen_info = { 0, 24, 0, 0, 0, 80, 0, 0, 0, 24, 1, 16};
 #endif
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index c7a021d..c44f830 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -30,6 +30,7 @@
 #include <linux/stringify.h>
 #include <linux/kallsyms.h>
 #include <linux/delay.h>
+#include <linux/hardirq.h>
 
 #include <asm/ptrace.h>
 #include <asm/timex.h>
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 33f366b..bdd860d 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -14,6 +14,7 @@
 
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/hardirq.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 #include <asm/hardirq.h>
diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c
index efed889..25d46c8 100644
--- a/arch/xtensa/platforms/iss/console.c
+++ b/arch/xtensa/platforms/iss/console.c
@@ -140,16 +140,14 @@ static void rs_poll(unsigned long priv)
 }
 
 
-static void rs_put_char(struct tty_struct *tty, unsigned char ch)
+static int rs_put_char(struct tty_struct *tty, unsigned char ch)
 {
 	char buf[2];
 
-	if (!tty)
-		return;
-
 	buf[0] = ch;
 	buf[1] = '\0';		/* Is this NULL necessary? */
 	__simc (SYS_write, 1, (unsigned long) buf, 1, 0, 0);
+	return 1;
 }
 
 static void rs_flush_chars(struct tty_struct *tty)
-- 
cgit v0.10.2


From 2f68891314b14e7e0ef07b4e77a8ea6e917fc74b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 10 Mar 2009 12:55:50 -0700
Subject: x86/agp: tighten check to update amd nb aperture

Impact: fix bug to make agp work with dri

Jeffrey reported that dri does work with 64bit, but doesn't work with
32bit it turns out NB aperture is 32M, aperture on agp is 128M

64bit is using 64M for vaidation for 64 iommu/gart 32bit is only using
32M..., and will not update the nb aperture.

So try to compare nb apterture and agp apterture before leaving not
touch nb aperture.

Reported-by: Jeffrey Trull <jetrull@sbcglobal.net>
Tested-by: Jeffrey Trull <jetrull@sbcglobal.net>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Dave Airlie <airlied@linux.ie>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 52f4361..d765afd 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -271,15 +271,15 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	nb_order = (nb_order >> 1) & 7;
 	pci_read_config_dword(nb, AMD64_GARTAPERTUREBASE, &nb_base);
 	nb_aper = nb_base << 25;
-	if (agp_aperture_valid(nb_aper, (32*1024*1024)<<nb_order)) {
-		return 0;
-	}
 
 	/* Northbridge seems to contain crap. Try the AGP bridge. */
 
 	pci_read_config_word(agp, cap+0x14, &apsize);
-	if (apsize == 0xffff)
+	if (apsize == 0xffff) {
+		if (agp_aperture_valid(nb_aper, (32*1024*1024)<<nb_order))
+			return 0;
 		return -1;
+	}
 
 	apsize &= 0xfff;
 	/* Some BIOS use weird encodings not in the AGPv3 table. */
@@ -301,6 +301,11 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 		order = nb_order;
 	}
 
+	if (nb_order >= order) {
+		if (agp_aperture_valid(nb_aper, (32*1024*1024)<<nb_order))
+			return 0;
+	}
+
 	dev_info(&agp->dev, "aperture from AGP @ %Lx size %u MB\n",
 		 aper, 32 << order);
 	if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order))
-- 
cgit v0.10.2


From 260219cc48cfb22486e5d0d706c978228a080d63 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 10 Mar 2009 12:55:51 -0700
Subject: devpts: remove graffiti

Very annoying when working with containters.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5f3231b..bff4052 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb)
 
 	fsi->ptmx_dentry = dentry;
 	rc = 0;
-
-	printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
-			inode->i_ino);
 out:
 	mutex_unlock(&root->d_inode->i_mutex);
 	return rc;
@@ -369,8 +366,6 @@ static int new_pts_mount(struct file_system_type *fs_type, int flags,
 	struct pts_fs_info *fsi;
 	struct pts_mount_opts *opts;
 
-	printk(KERN_NOTICE "devpts: newinstance mount\n");
-
 	err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
 	if (err)
 		return err;
-- 
cgit v0.10.2


From 1b23336ad98b3666c216617227c7767cd60a22be Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 10 Mar 2009 12:55:52 -0700
Subject: idr: make idr_remove_all() do removal -before- free_layer()

Fix a problem in the IDR system, where an idr_remove_all() hands a data
element to call_rcu() (via free_layer()) before making that data element
inaccessible to new readers.  This is very bad, and results in readers
still having a reference to this data element at the end of the grace
period.

Tests on large machines that concurrently map and unmap user-space memory
within the same multithreaded process result in crashes within about five
minutes.  Applying this patch increases the kernel's longevity to the
three-to-eight-hour range.

There appear to be other similar problems in idr_get_empty_slot() and
sub_remove(), but I fixed the easy one in idr_remove_all() first.  It is
therefore no surprise that failures still occur.

Located-by: Milton Miller II <miltonm@austin.ibm.com>
Tested-by: Milton Miller II <miltonm@austin.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/lib/idr.c b/lib/idr.c
index c11c576..dab4bca 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -449,6 +449,7 @@ void idr_remove_all(struct idr *idp)
 
 	n = idp->layers * IDR_BITS;
 	p = idp->top;
+	rcu_assign_pointer(idp->top, NULL);
 	max = 1 << n;
 
 	id = 0;
@@ -467,7 +468,6 @@ void idr_remove_all(struct idr *idp)
 			p = *--paa;
 		}
 	}
-	rcu_assign_pointer(idp->top, NULL);
 	idp->layers = 0;
 }
 EXPORT_SYMBOL(idr_remove_all);
-- 
cgit v0.10.2


From 229cc58ba2b5a83b0b55764c6cb98695c106238a Mon Sep 17 00:00:00 2001
From: Will Newton <will.newton@gmail.com>
Date: Tue, 10 Mar 2009 12:55:53 -0700
Subject: mtd_dataflash: fix probing of AT45DB321C chips.

Commit 771999b65f79264acde4b855e5d35696eca5e80c ("[MTD] DataFlash: bugfix,
binary page sizes now handled") broke support for probing AT45DB321C flash
chips.  These chips do not support the "page size" status bit, so if we
match the JEDEC id return early.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Will Newton <will.newton@gmail.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c
index d44f741..6d9f810 100644
--- a/drivers/mtd/devices/mtd_dataflash.c
+++ b/drivers/mtd/devices/mtd_dataflash.c
@@ -821,7 +821,8 @@ static struct flash_info *__devinit jedec_probe(struct spi_device *spi)
 					if (!(info->flags & IS_POW2PS))
 						return info;
 				}
-			}
+			} else
+				return info;
 		}
 	}
 
-- 
cgit v0.10.2


From 9c1e8a4ebcc04226cb6f3a1bf1d72f4cafd6b089 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Tue, 10 Mar 2009 12:55:54 -0700
Subject: intel-agp: fix a panic with 1M of shared memory, no GTT entries

When GTT size is equal to amount of video memory, the amount of GTT
entries is computed lower than zero, which is invalid and leads to
off-by-one error in intel_i915_configure()

Originally posted here:
http://bugzilla.kernel.org/show_bug.cgi?id=12539
http://bugzilla.redhat.com/show_bug.cgi?id=445592

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Cc: Lubomir Rintel <lkundrak@v3.sk>
Cc: Dave Airlie <airlied@linux.ie>
Reviewed-by: Eric Anholt <eric@anholt.net>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index c771418..4373adb 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -633,13 +633,15 @@ static void intel_i830_init_gtt_entries(void)
 			break;
 		}
 	}
-	if (gtt_entries > 0)
+	if (gtt_entries > 0) {
 		dev_info(&agp_bridge->dev->dev, "detected %dK %s memory\n",
 		       gtt_entries / KB(1), local ? "local" : "stolen");
-	else
+		gtt_entries /= KB(4);
+	} else {
 		dev_info(&agp_bridge->dev->dev,
 		       "no pre-allocated video memory detected\n");
-	gtt_entries /= KB(4);
+		gtt_entries = 0;
+	}
 
 	intel_private.gtt_entries = gtt_entries;
 }
-- 
cgit v0.10.2


From d58ab5cf09679d8cb4824e22cae900c0eab5ab31 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Tue, 10 Mar 2009 12:55:55 -0700
Subject: mtd: physmap: fix NULL pointer dereference in error path

commit e480814f138cd5d78a8efe397756ba6b6518fdb6 ("[MTD] [MAPS] physmap:
fix wrong free and del_mtd_{partition,device}") introduces a NULL pointer
dereference in physmap_flash_remove when called from the error path in
physmap_flash_probe (if map_probe failed).

Call del_mtd_{partition,device} only if info->cmtd was not NULL.

Reported-by: pHilipp Zabel <philipp.zabel@gmail.com>
Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c
index 4b122e7..2297182 100644
--- a/drivers/mtd/maps/physmap.c
+++ b/drivers/mtd/maps/physmap.c
@@ -46,16 +46,19 @@ static int physmap_flash_remove(struct platform_device *dev)
 
 	physmap_data = dev->dev.platform_data;
 
+	if (info->cmtd) {
 #ifdef CONFIG_MTD_PARTITIONS
-	if (info->nr_parts) {
-		del_mtd_partitions(info->cmtd);
-		kfree(info->parts);
-	} else if (physmap_data->nr_parts)
-		del_mtd_partitions(info->cmtd);
-	else
-		del_mtd_device(info->cmtd);
+		if (info->nr_parts || physmap_data->nr_parts)
+			del_mtd_partitions(info->cmtd);
+		else
+			del_mtd_device(info->cmtd);
 #else
-	del_mtd_device(info->cmtd);
+		del_mtd_device(info->cmtd);
+#endif
+	}
+#ifdef CONFIG_MTD_PARTITIONS
+	if (info->nr_parts)
+		kfree(info->parts);
 #endif
 
 #ifdef CONFIG_MTD_CONCAT
-- 
cgit v0.10.2


From be50b8342dead8cacf57d4839240106b225d31f5 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Tue, 10 Mar 2009 12:55:56 -0700
Subject: kernel/user.c: fix a memory leak when freeing up non-init
 usernamespaces users

We were returning early in the sysfs directory cleanup function if the
user belonged to a non init usernamespace.  Due to this a lot of the
cleanup was not done and we were left with a leak.  Fix the leak.

Reported-by: Serge Hallyn <serue@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Tested-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/user.c b/kernel/user.c
index 6a9b696..fbb300e 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -286,14 +286,12 @@ int __init uids_sysfs_init(void)
 /* work function to remove sysfs directory for a user and free up
  * corresponding structures.
  */
-static void remove_user_sysfs_dir(struct work_struct *w)
+static void cleanup_user_struct(struct work_struct *w)
 {
 	struct user_struct *up = container_of(w, struct user_struct, work);
 	unsigned long flags;
 	int remove_user = 0;
 
-	if (up->user_ns != &init_user_ns)
-		return;
 	/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
 	 * atomic.
 	 */
@@ -312,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w)
 	if (!remove_user)
 		goto done;
 
-	kobject_uevent(&up->kobj, KOBJ_REMOVE);
-	kobject_del(&up->kobj);
-	kobject_put(&up->kobj);
+	if (up->user_ns == &init_user_ns) {
+		kobject_uevent(&up->kobj, KOBJ_REMOVE);
+		kobject_del(&up->kobj);
+		kobject_put(&up->kobj);
+	}
 
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
@@ -335,7 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
 	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 
-	INIT_WORK(&up->work, remove_user_sysfs_dir);
+	INIT_WORK(&up->work, cleanup_user_struct);
 	schedule_work(&up->work);
 }
 
-- 
cgit v0.10.2


From 0612ea00a010e36fde61e7b7649a1105b0ef1080 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 10 Mar 2009 12:55:57 -0700
Subject: rcu: documentation 1Q09 update

Update the RCU documentation to call out the need for callers of
primitives like call_rcu() and synchronize_rcu() to prevent subsequent RCU
readers from hazard.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 6e25340..accfe2f 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -298,3 +298,15 @@ over a rather long period of time, but improvements are always welcome!
 
 	Note that, rcu_assign_pointer() and rcu_dereference() relate to
 	SRCU just as they do to other forms of RCU.
+
+15.	The whole point of call_rcu(), synchronize_rcu(), and friends
+	is to wait until all pre-existing readers have finished before
+	carrying out some otherwise-destructive operation.  It is
+	therefore critically important to -first- remove any path
+	that readers can follow that could be affected by the
+	destructive operation, and -only- -then- invoke call_rcu(),
+	synchronize_rcu(), or friends.
+
+	Because these primitives only wait for pre-existing readers,
+	it is the caller's responsibility to guarantee safety to
+	any subsequent readers.
-- 
cgit v0.10.2


From 9333d826813a2722d3babfbfd0f65c21b75127b3 Mon Sep 17 00:00:00 2001
From: Steven King <sfking@fdwdc.com>
Date: Tue, 10 Mar 2009 12:55:57 -0700
Subject: m68knommu: m5206e build fix

Signed-off-by: Steven King <sfking@fdwdc.com>
Acked-by: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/m68knommu/platform/5206e/config.c b/arch/m68knommu/platform/5206e/config.c
index d01a5d2..db90254 100644
--- a/arch/m68knommu/platform/5206e/config.c
+++ b/arch/m68knommu/platform/5206e/config.c
@@ -17,6 +17,7 @@
 #include <asm/coldfire.h>
 #include <asm/mcfsim.h>
 #include <asm/mcfdma.h>
+#include <asm/mcfuart.h>
 
 /***************************************************************************/
 
-- 
cgit v0.10.2


From 9233ecc5c4b5f31921d914b3ef3baa3dc1ae4cc6 Mon Sep 17 00:00:00 2001
From: Steven King <sfking@fdwdc.com>
Date: Tue, 10 Mar 2009 12:55:58 -0700
Subject: m68knommu: m528x build fix

There isn't any mcfqspi.h in the tree, and without it everything inside the
#ifdef CONFIG_SPI is uncompilable.

Signed-off-by: Steven King <sfking@fdwdc.com>
Acked-by: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/m68knommu/platform/528x/config.c b/arch/m68knommu/platform/528x/config.c
index dfdb5c2e..44baeb2 100644
--- a/arch/m68knommu/platform/528x/config.c
+++ b/arch/m68knommu/platform/528x/config.c
@@ -24,7 +24,6 @@
 #include <asm/coldfire.h>
 #include <asm/mcfsim.h>
 #include <asm/mcfuart.h>
-#include <asm/mcfqspi.h>
 
 #ifdef CONFIG_MTD_PARTITIONS
 #include <linux/mtd/partitions.h>
@@ -33,233 +32,6 @@
 /***************************************************************************/
 
 void coldfire_reset(void);
-static void coldfire_qspi_cs_control(u8 cs, u8 command);
-
-/***************************************************************************/
-
-#if defined(CONFIG_SPI)
-
-#if defined(CONFIG_WILDFIRE)
-#define SPI_NUM_CHIPSELECTS 	0x02
-#define SPI_PAR_VAL		0x07  /* Enable DIN, DOUT, CLK */
-#define SPI_CS_MASK		0x18
-
-#define FLASH_BLOCKSIZE		(1024*64)
-#define FLASH_NUMBLOCKS		16
-#define FLASH_TYPE		"m25p80"
-
-#define M25P80_CS		0
-#define MMC_CS			1
-
-#ifdef CONFIG_MTD_PARTITIONS
-static struct mtd_partition stm25p_partitions[] = {
-	/* sflash */
-	[0] = {
-		.name = "stm25p80",
-		.offset = 0x00000000,
-		.size = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS,
-		.mask_flags = 0
-	}
-};
-
-#endif
-
-#elif defined(CONFIG_WILDFIREMOD)
-
-#define SPI_NUM_CHIPSELECTS	0x08
-#define SPI_PAR_VAL		0x07  /* Enable DIN, DOUT, CLK */
-#define SPI_CS_MASK		0x78
-
-#define FLASH_BLOCKSIZE		(1024*64)
-#define FLASH_NUMBLOCKS		64
-#define FLASH_TYPE		"m25p32"
-/* Reserve 1M for the kernel parition */
-#define FLASH_KERNEL_SIZE   (1024 * 1024)
-
-#define M25P80_CS		5
-#define MMC_CS			6
-
-#ifdef CONFIG_MTD_PARTITIONS
-static struct mtd_partition stm25p_partitions[] = {
-	/* sflash */
-	[0] = {
-		.name = "kernel",
-		.offset = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS - FLASH_KERNEL_SIZE,
-		.size = FLASH_KERNEL_SIZE,
-		.mask_flags = 0
-	},
-	[1] = {
-		.name = "image",
-		.offset = 0x00000000,
-		.size = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS - FLASH_KERNEL_SIZE,
-		.mask_flags = 0
-	},
-	[2] = {
-		.name = "all",
-		.offset = 0x00000000,
-		.size = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS,
-		.mask_flags = 0
-	}
-};
-#endif
-
-#else
-#define SPI_NUM_CHIPSELECTS 	0x04
-#define SPI_PAR_VAL		0x7F  /* Enable DIN, DOUT, CLK, CS0 - CS4 */
-#endif
-
-#ifdef MMC_CS
-static struct coldfire_spi_chip flash_chip_info = {
-	.mode = SPI_MODE_0,
-	.bits_per_word = 16,
-	.del_cs_to_clk = 17,
-	.del_after_trans = 1,
-	.void_write_data = 0
-};
-
-static struct coldfire_spi_chip mmc_chip_info = {
-	.mode = SPI_MODE_0,
-	.bits_per_word = 16,
-	.del_cs_to_clk = 17,
-	.del_after_trans = 1,
-	.void_write_data = 0xFFFF
-};
-#endif
-
-#ifdef M25P80_CS
-static struct flash_platform_data stm25p80_platform_data = {
-	.name = "ST M25P80 SPI Flash chip",
-#ifdef CONFIG_MTD_PARTITIONS
-	.parts = stm25p_partitions,
-	.nr_parts = sizeof(stm25p_partitions) / sizeof(*stm25p_partitions),
-#endif
-	.type = FLASH_TYPE
-};
-#endif
-
-static struct spi_board_info spi_board_info[] __initdata = {
-#ifdef M25P80_CS
-	{
-		.modalias = "m25p80",
-		.max_speed_hz = 16000000,
-		.bus_num = 1,
-		.chip_select = M25P80_CS,
-		.platform_data = &stm25p80_platform_data,
-		.controller_data = &flash_chip_info
-	},
-#endif
-#ifdef MMC_CS
-	{
-		.modalias = "mmc_spi",
-		.max_speed_hz = 16000000,
-		.bus_num = 1,
-		.chip_select = MMC_CS,
-		.controller_data = &mmc_chip_info
-	}
-#endif
-};
-
-static struct coldfire_spi_master coldfire_master_info = {
-	.bus_num = 1,
-	.num_chipselect = SPI_NUM_CHIPSELECTS,
-	.irq_source = MCF5282_QSPI_IRQ_SOURCE,
-	.irq_vector = MCF5282_QSPI_IRQ_VECTOR,
-	.irq_mask = ((0x01 << MCF5282_QSPI_IRQ_SOURCE) | 0x01),
-	.irq_lp = 0x2B,  /* Level 5 and Priority 3 */
-	.par_val = SPI_PAR_VAL,
-	.cs_control = coldfire_qspi_cs_control,
-};
-
-static struct resource coldfire_spi_resources[] = {
-	[0] = {
-		.name = "qspi-par",
-		.start = MCF5282_QSPI_PAR,
-		.end = MCF5282_QSPI_PAR,
-		.flags = IORESOURCE_MEM
-	},
-
-	[1] = {
-		.name = "qspi-module",
-		.start = MCF5282_QSPI_QMR,
-		.end = MCF5282_QSPI_QMR + 0x18,
-		.flags = IORESOURCE_MEM
-	},
-
-	[2] = {
-		.name = "qspi-int-level",
-		.start = MCF5282_INTC0 + MCFINTC_ICR0 + MCF5282_QSPI_IRQ_SOURCE,
-		.end = MCF5282_INTC0 + MCFINTC_ICR0 + MCF5282_QSPI_IRQ_SOURCE,
-		.flags = IORESOURCE_MEM
-	},
-
-	[3] = {
-		.name = "qspi-int-mask",
-		.start = MCF5282_INTC0 + MCFINTC_IMRL,
-		.end = MCF5282_INTC0 + MCFINTC_IMRL,
-		.flags = IORESOURCE_MEM
-	}
-};
-
-static struct platform_device coldfire_spi = {
-	.name = "spi_coldfire",
-	.id = -1,
-	.resource = coldfire_spi_resources,
-	.num_resources = ARRAY_SIZE(coldfire_spi_resources),
-	.dev = {
-		.platform_data = &coldfire_master_info,
-	}
-};
-
-static void coldfire_qspi_cs_control(u8 cs, u8 command)
-{
-	u8 cs_bit = ((0x01 << cs) << 3) & SPI_CS_MASK;
-
-#if defined(CONFIG_WILDFIRE)
-	u8 cs_mask = ~(((0x01 << cs) << 3) & SPI_CS_MASK);
-#endif
-#if defined(CONFIG_WILDFIREMOD)
-	u8 cs_mask = (cs << 3) & SPI_CS_MASK;
-#endif
-
-	/*
-	 * Don't do anything if the chip select is not
-	 * one of the port qs pins.
-	 */
-	if (command & QSPI_CS_INIT) {
-#if defined(CONFIG_WILDFIRE)
-		MCF5282_GPIO_DDRQS  |= cs_bit;
-		MCF5282_GPIO_PQSPAR &= ~cs_bit;
-#endif
-
-#if defined(CONFIG_WILDFIREMOD)
-		MCF5282_GPIO_DDRQS  |= SPI_CS_MASK;
-		MCF5282_GPIO_PQSPAR &= ~SPI_CS_MASK;
-#endif
-	}
-
-	if (command & QSPI_CS_ASSERT) {
-		MCF5282_GPIO_PORTQS &= ~SPI_CS_MASK;
-		MCF5282_GPIO_PORTQS |= cs_mask;
-	} else if (command & QSPI_CS_DROP) {
-		MCF5282_GPIO_PORTQS |= SPI_CS_MASK;
-	}
-}
-
-static int __init spi_dev_init(void)
-{
-	int retval;
-
-	retval = platform_device_register(&coldfire_spi);
-	if (retval < 0)
-		return retval;
-
-	if (ARRAY_SIZE(spi_board_info))
-		retval = spi_register_board_info(spi_board_info, ARRAY_SIZE(spi_board_info));
-
-	return retval;
-}
-
-#endif /* CONFIG_SPI */
 
 /***************************************************************************/
 
-- 
cgit v0.10.2


From 16b71fdf97599f1b1b7f38418ee9922d9f117396 Mon Sep 17 00:00:00 2001
From: Samuel CUELLA <samuel.cuella@supinfo.com>
Date: Tue, 10 Mar 2009 12:56:00 -0700
Subject: i810: fix kernel crash fix when struct fb_var_screeninfo is supplied

Prevent the kernel from being crashed by a divide-by-zero operation when
supplied an incorrectly filled 'struct fb_var_screeninfo' from userland.

Previously i810_main.c:1005 (i810_check_params) was using the global
'yres' symbol previously defined at i810_main.c:145 as a module parameter
value holder (i810_main.c:2174).  If i810fb is compiled-in or if this
param doesn't get a default value, this direct usage leads to a
divide-by-zero at i810_main.c:1005 (i810_check_params).  The patch simply
replace the 'yres' global, perhaps undefined symbol usage by a given
parameter structure lookup.

This problem occurs with directfb, mplayer -vo fbdev, SDL library.
It was also reported ( but non solved ) at:

	http://mail.directfb.org/pipermail/directfb-dev/2008-March/004050.html

Signed-off-by: Samuel CUELLA <samuel.cuella@supinfo.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c
index a24e680..2e94019 100644
--- a/drivers/video/i810/i810_main.c
+++ b/drivers/video/i810/i810_main.c
@@ -993,6 +993,7 @@ static int i810_check_params(struct fb_var_screeninfo *var,
 	struct i810fb_par *par = info->par;
 	int line_length, vidmem, mode_valid = 0, retval = 0;
 	u32 vyres = var->yres_virtual, vxres = var->xres_virtual;
+
 	/*
 	 *  Memory limit
 	 */
@@ -1002,12 +1003,12 @@ static int i810_check_params(struct fb_var_screeninfo *var,
 	if (vidmem > par->fb.size) {
 		vyres = par->fb.size/line_length;
 		if (vyres < var->yres) {
-			vyres = yres;
+			vyres = info->var.yres;
 			vxres = par->fb.size/vyres;
 			vxres /= var->bits_per_pixel >> 3;
 			line_length = get_line_length(par, vxres, 
 						      var->bits_per_pixel);
-			vidmem = line_length * yres;
+			vidmem = line_length * info->var.yres;
 			if (vxres < var->xres) {
 				printk("i810fb: required video memory, "
 				       "%d bytes, for %dx%d-%d (virtual) "
-- 
cgit v0.10.2


From fef3a7a17418814733ebde0b40d8e32747677c8f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 10 Mar 2009 10:56:57 +0800
Subject: x86, kexec: fix kexec x86 coding style

Impact: Cleanup

Fix some coding style issue for kexec x86.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c7..e7368c1 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
 #include <linux/gfp.h>
+#include <linux/io.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
 #include <asm/apic.h>
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
 		"\tmovl %%eax,%%fs\n"
 		"\tmovl %%eax,%%gs\n"
 		"\tmovl %%eax,%%ss\n"
-		::: "eax", "memory");
+		: : : "eax", "memory");
 #undef STR
 #undef __STR
 }
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
-		/* We need to put APICs in legacy mode so that we can
+		/*
+		 * We need to put APICs in legacy mode so that we can
 		 * get timer interrupts in second kernel. kexec/kdump
 		 * paths already have calls to disable_IO_APIC() in
 		 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
 		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
 						<< PAGE_SHIFT);
 
-	/* The segment registers are funny things, they have both a
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51..f8c796f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,11 @@
 #include <linux/reboot.h>
 #include <linux/numa.h>
 #include <linux/ftrace.h>
+#include <linux/io.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
 
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
@@ -83,9 +83,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
 		}
 		level3p = (pud_t *)page_address(page);
 		result = init_level3_page(image, level3p, addr, last_addr);
-		if (result) {
+		if (result)
 			goto out;
-		}
 		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
 		addr += PGDIR_SIZE;
 	}
@@ -242,7 +241,8 @@ void machine_kexec(struct kimage *image)
 	page_list[PA_TABLE_PAGE] =
 	  (unsigned long)__pa(page_address(image->control_code_page));
 
-	/* The segment registers are funny things, they have both a
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -252,11 +252,12 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0a..4123553 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
 
 #define PTR(x) (x << 2)
 
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
  * ~ control_page + PAGE_SIZE are used as data storage and stack for
  * jumping back
  */
@@ -76,8 +77,10 @@ relocate_kernel:
 	movl	%eax, CP_PA_SWAP_PAGE(%edi)
 	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
 
 	/* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushl   %edx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging disabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
 	/* clear cr4 if applicable */
 	testl	%ecx, %ecx
 	jz	1f
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 * Setting everything to zero seems safe.
 	 */
 	xorl	%eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
 	call	swap_pages
 	addl	$8, %esp
 
-	/* To be certain of avoiding problems with self-modifying code
+	/*
+	 * To be certain of avoiding problems with self-modifying code
 	 * I need to execute a serializing instruction here.
 	 * So I flush the TLB, it's handy, and not processor dependent.
 	 */
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	/*
+	 * set all of the registers to known values
+	 * leave %esp alone
+	 */
 
 	testl	%esi, %esi
 	jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb2..cfc0d24 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -24,7 +24,8 @@
 	.code64
 	.globl relocate_kernel
 relocate_kernel:
-	/* %rdi indirection_page
+	/*
+	 * %rdi indirection_page
 	 * %rsi page_list
 	 * %rdx start address
 	 */
@@ -33,8 +34,10 @@ relocate_kernel:
 	pushq $0
 	popfq
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
 	/* get physical address of page table now too */
@@ -55,7 +58,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushq   %rdx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging enabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -68,7 +72,8 @@ identity_mapped:
 	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
 	movq	%rax, %cr0
 
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 *  - physical address extension enabled
 	 */
 	movq	$X86_CR4_PAE, %rax
@@ -117,7 +122,8 @@ identity_mapped:
 	jmp	0b
 3:
 
-	/* To be certain of avoiding problems with self-modifying code
+	/*
+	 * To be certain of avoiding problems with self-modifying code
 	 * I need to execute a serializing instruction here.
 	 * So I flush the TLB by reloading %cr3 here, it's handy,
 	 * and not processor dependent.
@@ -125,8 +131,10 @@ identity_mapped:
 	movq	%cr3, %rax
 	movq	%rax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %rsp alone */
+	/*
+	 * set all of the registers to known values
+	 * leave %rsp alone
+	 */
 
 	xorq	%rax, %rax
 	xorq	%rbx, %rbx
-- 
cgit v0.10.2


From 5359454701ce51a4626b1ef6eb7b16ec35bd458d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 10 Mar 2009 10:57:04 +0800
Subject: x86, kexec: x86_64: add identity map for pages at image->start

Impact: Fix corner case that cannot yet occur

image->start may be outside of 0 ~ max_pfn, for example when jumping
back to original kernel from kexeced kenrel. This patch add identity
map for pages at image->start.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index f8c796f..7cc5d3d0 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,41 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+				unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	struct page *page;
+	int result = -ENOMEM;
+
+	addr &= PMD_MASK;
+	pgd += pgd_index(addr);
+	if (!pgd_present(*pgd)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pud = (pud_t *)page_address(page);
+		memset(pud, 0, PAGE_SIZE);
+		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+	}
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pmd = (pmd_t *)page_address(page);
+		memset(pmd, 0, PAGE_SIZE);
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	}
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+	result = 0;
+out:
+	return result;
+}
+
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
 	unsigned long end_addr;
@@ -155,6 +190,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 	result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
 	if (result)
 		return result;
+	/*
+	 * image->start may be outside 0 ~ max_pfn, for example when
+	 * jump back to original kernel from kexeced kernel
+	 */
+	result = init_one_level2_page(image, level4p, image->start);
+	if (result)
+		return result;
 	return init_transition_pgtable(image, level4p);
 }
 
-- 
cgit v0.10.2


From fee7b0d84cc8c7bc5dc212901c79e93eaf83a5b5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 10 Mar 2009 10:57:16 +0800
Subject: x86, kexec: x86_64: add kexec jump support for x86_64

Impact: New major feature

This patch add kexec jump support for x86_64. More information about
kexec jump can be found in corresponding x86_32 support patch.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3175837..87717f3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1431,7 +1431,7 @@ config CRASH_DUMP
 config KEXEC_JUMP
 	bool "kexec jump (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
-	depends on KEXEC && HIBERNATION && X86_32
+	depends on KEXEC && HIBERNATION
 	---help---
 	  Jump between original kernel and kexeced kernel and invoke
 	  code in physical address mode via KEXEC
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d1..317ff17 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
 # define PAGES_NR		4
 #else
 # define PA_CONTROL_PAGE	0
-# define PA_TABLE_PAGE		1
-# define PAGES_NR		2
+# define VA_CONTROL_PAGE	1
+# define PA_TABLE_PAGE		2
+# define PA_SWAP_PAGE		3
+# define PAGES_NR		4
 #endif
 
-#ifdef CONFIG_X86_32
 # define KEXEC_CONTROL_CODE_MAX_SIZE	2048
-#endif
 
 #ifndef __ASSEMBLY__
 
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
 		unsigned int has_pae,
 		unsigned int preserve_context);
 #else
-NORET_TYPE void
+unsigned long
 relocate_kernel(unsigned long indirection_page,
 		unsigned long page_list,
-		unsigned long start_address) ATTRIB_NORET;
+		unsigned long start_address,
+		unsigned int preserve_context);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 7cc5d3d0..89cea4d 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -13,6 +13,7 @@
 #include <linux/numa.h>
 #include <linux/ftrace.h>
 #include <linux/io.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -270,19 +271,43 @@ void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	int save_ftrace_enabled;
 
-	tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		save_processor_state();
+#endif
+
+	save_ftrace_enabled = __ftrace_enabled_save();
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/*
+		 * We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or other. kexec jump path also need
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
 
 	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_TABLE_PAGE] =
 	  (unsigned long)__pa(page_address(image->control_code_page));
 
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+						<< PAGE_SHIFT);
+
 	/*
 	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -302,8 +327,17 @@ void machine_kexec(struct kimage *image)
 	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start);
+	image->start = relocate_kernel((unsigned long)image->head,
+				       (unsigned long)page_list,
+				       image->start,
+				       image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		restore_processor_state();
+#endif
+
+	__ftrace_enabled_restore(save_ftrace_enabled);
 }
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index cfc0d24..4de8f5b 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,6 +19,24 @@
 #define PTR(x) (x << 3)
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP			DATA(0x0)
+#define CR0			DATA(0x8)
+#define CR3			DATA(0x10)
+#define CR4			DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE	DATA(0x20)
+#define CP_PA_SWAP_PAGE		DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
+
 	.text
 	.align PAGE_SIZE
 	.code64
@@ -28,8 +46,27 @@ relocate_kernel:
 	 * %rdi indirection_page
 	 * %rsi page_list
 	 * %rdx start address
+	 * %rcx preserve_context
 	 */
 
+	/* Save the CPU context, used for jumping back */
+	pushq %rbx
+	pushq %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushf
+
+	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
+	movq	%rsp, RSP(%r11)
+	movq	%cr0, %rax
+	movq	%rax, CR0(%r11)
+	movq	%cr3, %rax
+	movq	%rax, CR3(%r11)
+	movq	%cr4, %rax
+	movq	%rax, CR4(%r11)
+
 	/* zero out flags, and disable interrupts */
 	pushq $0
 	popfq
@@ -41,10 +78,18 @@ relocate_kernel:
 	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
 	/* get physical address of page table now too */
-	movq	PTR(PA_TABLE_PAGE)(%rsi), %rcx
+	movq	PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+	/* get physical address of swap page now */
+	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+	/* save some information for jumping back */
+	movq	%r9, CP_PA_TABLE_PAGE(%r11)
+	movq	%r10, CP_PA_SWAP_PAGE(%r11)
+	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
 
 	/* Switch to the identity mapped page tables */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
 
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%r8), %rsp
@@ -83,9 +128,87 @@ identity_mapped:
 1:
 
 	/* Flush the TLB (needed?) */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
+
+	movq	%rcx, %r11
+	call	swap_pages
+
+	/*
+	 * To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/*
+	 * set all of the registers to known values
+	 * leave %rsp alone
+	 */
+
+	testq	%r11, %r11
+	jnz 1f
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq    %rcx, %rcx
+	xorq    %rdx, %rdx
+	xorq    %rsi, %rsi
+	xorq    %rdi, %rdi
+	xorq    %rbp, %rbp
+	xorq	%r8,  %r8
+	xorq	%r9,  %r9
+	xorq	%r10, %r9
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	ret
+
+1:
+	popq	%rdx
+	leaq	PAGE_SIZE(%r10), %rsp
+	call	*%rdx
+
+	/* get the re-entry point of the peer system */
+	movq	0(%rsp), %rbp
+	call	1f
+1:
+	popq	%r8
+	subq	$(1b - relocate_kernel), %r8
+	movq	CP_PA_SWAP_PAGE(%r8), %r10
+	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+	movq	CP_PA_TABLE_PAGE(%r8), %rax
+	movq	%rax, %cr3
+	lea	PAGE_SIZE(%r8), %rsp
+	call	swap_pages
+	movq	$virtual_mapped, %rax
+	pushq	%rax
+	ret
+
+virtual_mapped:
+	movq	RSP(%r8), %rsp
+	movq	CR4(%r8), %rax
+	movq	%rax, %cr4
+	movq	CR3(%r8), %rax
+	movq	CR0(%r8), %r8
+	movq	%rax, %cr3
+	movq	%r8, %cr0
+	movq	%rbp, %rax
+
+	popf
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	popq	%rbx
+	ret
 
 	/* Do the copies */
+swap_pages:
 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
 	xorq	%rdi, %rdi
 	xorq	%rsi, %rsi
@@ -117,39 +240,27 @@ identity_mapped:
 	movq	%rcx,   %rsi  /* For ever source page do a copy */
 	andq	$0xfffffffffffff000, %rsi
 
+	movq	%rdi, %rdx
+	movq	%rsi, %rax
+
+	movq	%r10, %rdi
 	movq	$512,   %rcx
 	rep ; movsq
-	jmp	0b
-3:
 
-	/*
-	 * To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB by reloading %cr3 here, it's handy,
-	 * and not processor dependent.
-	 */
-	movq	%cr3, %rax
-	movq	%rax, %cr3
-
-	/*
-	 * set all of the registers to known values
-	 * leave %rsp alone
-	 */
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
-	xorq	%rax, %rax
-	xorq	%rbx, %rbx
-	xorq    %rcx, %rcx
-	xorq    %rdx, %rdx
-	xorq    %rsi, %rsi
-	xorq    %rdi, %rdi
-	xorq    %rbp, %rbp
-	xorq	%r8,  %r8
-	xorq	%r9,  %r9
-	xorq	%r10, %r9
-	xorq	%r11, %r11
-	xorq	%r12, %r12
-	xorq	%r13, %r13
-	xorq	%r14, %r14
-	xorq	%r15, %r15
+	movq	%rdx, %rdi
+	movq	%r10, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
+	lea	PAGE_SIZE(%rax), %rsi
+	jmp	0b
+3:
 	ret
+
+	.globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6..5bf54e4 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 ASSERT((per_cpu__irq_stack_union == 0),
         "irq_stack_union is not at start of per-cpu area");
 #endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
-- 
cgit v0.10.2


From 5490fa96735ce0e2af270c0868987d644b9a38ec Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 11 Mar 2009 10:14:26 +0900
Subject: x86, mce: use round_jiffies() instead round_jiffies_relative()

Impact: saving power _very_ little

round_jiffies() round up absolute jiffies to full second.
round_jiffies_relative() round up relative jiffies to full second.

The "t->expires" is absolute jiffies. Then, round_jiffies() should be
used instead round_jiffies_relative().

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index bfbd532..ca14604 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -639,7 +639,7 @@ static void mce_init_timer(void)
 	if (!next_interval)
 		return;
 	setup_timer(t, mcheck_timer, smp_processor_id());
-	t->expires = round_jiffies_relative(jiffies + next_interval);
+	t->expires = round_jiffies(jiffies + next_interval);
 	add_timer(t);
 }
 
@@ -1110,7 +1110,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		t->expires = round_jiffies_relative(jiffies + next_interval);
+		t->expires = round_jiffies(jiffies + next_interval);
 		add_timer_on(t, cpu);
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
 		break;
-- 
cgit v0.10.2