From 08604bd9935dc98fb62ef61d5b7baa7ccc10f8c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jun 2009 15:31:12 -0700 Subject: time: move PIT_TICK_RATE to linux/timex.h PIT_TICK_RATE is currently defined in four architectures, but in three different places. While linux/timex.h is not the perfect place for it, it is still a reasonable replacement for those drivers that traditionally use asm/timex.h to get CLOCK_TICK_RATE and expect it to be the PIT frequency. Note that for Alpha, the actual value changed from 1193180UL to 1193182UL. This is unlikely to make a difference, and probably can only improve accuracy. There was a discussion on the correct value of CLOCK_TICK_RATE a few years ago, after which every existing instance was getting changed to 1193182. According to the specification, it should be 1193181.818181... Signed-off-by: Arnd Bergmann Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: john stultz Cc: Dmitry Torokhov Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/alpha/include/asm/8253pit.h b/arch/alpha/include/asm/8253pit.h index fef5c14..a71c9c1 100644 --- a/arch/alpha/include/asm/8253pit.h +++ b/arch/alpha/include/asm/8253pit.h @@ -1,10 +1,3 @@ /* * 8253/8254 Programmable Interval Timer */ - -#ifndef _8253PIT_H -#define _8253PIT_H - -#define PIT_TICK_RATE 1193180UL - -#endif diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index f15a329..d9f9cfe 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h index 5dabc87..032ca73 100644 --- a/arch/mips/include/asm/i8253.h +++ b/arch/mips/include/asm/i8253.h @@ -12,8 +12,6 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -#define PIT_TICK_RATE 1193182UL - extern spinlock_t i8253_lock; extern void setup_pit_timer(void); diff --git a/arch/powerpc/include/asm/8253pit.h b/arch/powerpc/include/asm/8253pit.h index b70d6e5..a71c9c1 100644 --- a/arch/powerpc/include/asm/8253pit.h +++ b/arch/powerpc/include/asm/8253pit.h @@ -1,10 +1,3 @@ -#ifndef _ASM_POWERPC_8253PIT_H -#define _ASM_POWERPC_8253PIT_H - /* * 8253/8254 Programmable Interval Timer */ - -#define PIT_TICK_RATE 1193182UL - -#endif /* _ASM_POWERPC_8253PIT_H */ diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index b5c9d45..1375cfc 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -4,9 +4,7 @@ #include #include -/* The PIT ticks at this frequency (in HZ): */ -#define PIT_TICK_RATE 1193182 - +/* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE #define ARCH_HAS_READ_CURRENT_TIMER diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0..5cf36c0 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3e1c057..ae3180c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index 40bd8c6..72a633a 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -18,6 +18,7 @@ #include #include
+#include #include #include #include diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 356b3a2..1c0b529 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #define DRIVER_DESC "Analog joystick and gamepad driver" diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c index d6a30ce..6d67af5 100644 --- a/drivers/input/misc/pcspkr.c +++ b/drivers/input/misc/pcspkr.c @@ -17,6 +17,7 @@ #include #include #include +#include #include MODULE_AUTHOR("Vojtech Pavlik "); diff --git a/include/linux/timex.h b/include/linux/timex.h index 9910e3b..e6967d1 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -280,6 +280,9 @@ extern int do_adjtimex(struct timex *); int read_current_timer(unsigned long *timer_val); +/* The clock frequency of the i8253/i8254 PIT */ +#define PIT_TICK_RATE 1193182ul + #endif /* KERNEL */ #endif /* LINUX_TIMEX_H */ diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h index cdef266..174dd2f 100644 --- a/sound/drivers/pcsp/pcsp.h +++ b/sound/drivers/pcsp/pcsp.h @@ -10,6 +10,7 @@ #define __PCSP_H__ #include +#include #if defined(CONFIG_MIPS) || defined(CONFIG_X86) /* Use the global PIT lock ! */ #include diff --git a/sound/oss/pas2_pcm.c b/sound/oss/pas2_pcm.c index 36c3ea6..8f7d175 100644 --- a/sound/oss/pas2_pcm.c +++ b/sound/oss/pas2_pcm.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include "sound_config.h" #include "pas2.h" -- cgit v0.10.2 From 021415468c889979117b1a07b96f7e36de33e995 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 16 Jun 2009 15:31:15 -0700 Subject: spi: takes size of a pointer to determine the size of the pointed-to type Do not take the size of a pointer to determine the size of the pointed-to type. Signed-off-by: Roel Kluin Acked-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Acked-by: Grant Likely Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index f4573a9..a32ccb4 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -711,12 +711,12 @@ static int of_mpc83xx_spi_get_chipselects(struct device *dev) return 0; } - pinfo->gpios = kmalloc(ngpios * sizeof(pinfo->gpios), GFP_KERNEL); + pinfo->gpios = kmalloc(ngpios * sizeof(*pinfo->gpios), GFP_KERNEL); if (!pinfo->gpios) return -ENOMEM; - memset(pinfo->gpios, -1, ngpios * sizeof(pinfo->gpios)); + memset(pinfo->gpios, -1, ngpios * sizeof(*pinfo->gpios)); - pinfo->alow_flags = kzalloc(ngpios * sizeof(pinfo->alow_flags), + pinfo->alow_flags = kzalloc(ngpios * sizeof(*pinfo->alow_flags), GFP_KERNEL); if (!pinfo->alow_flags) { ret = -ENOMEM; -- cgit v0.10.2 From 3b0fde0fac19c180317eb0601b3504083f4b9bf5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Jun 2009 15:31:16 -0700 Subject: firmware_map: fix hang with x86/32bit Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13484 Peer reported: | The bug is introduced from kernel 2.6.27, if E820 table reserve the memory | above 4G in 32bit OS(BIOS-e820: 00000000fff80000 - 0000000120000000 | (reserved)), system will report Int 6 error and hang up. The bug is caused by | the following code in drivers/firmware/memmap.c, the resource_size_t is 32bit | variable in 32bit OS, the BUG_ON() will be invoked to result in the Int 6 | error. I try the latest 32bit Ubuntu and Fedora distributions, all hit this | bug. 
|====== |static int firmware_map_add_entry(resource_size_t start, resource_size_t end, | const char *type, | struct firmware_map_entry *entry) and it only happens when CONFIG_PHYS_ADDR_T_64BIT is not set. It turns out we need to pass u64 instead of resource_size_t for that. [akpm@linux-foundation.org: add comment] Reported-and-tested-by: Peer Chen Signed-off-by: Yinghai Lu Cc: Ingo Molnar Acked-by: H. Peter Anvin Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 05aa2d4..d5ea8a6 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -31,8 +31,12 @@ * information is necessary as for the resource tree. */ struct firmware_map_entry { - resource_size_t start; /* start of the memory range */ - resource_size_t end; /* end of the memory range (incl.) */ + /* + * start and end must be u64 rather than resource_size_t, because e820 + * resources can lie at addresses above 4G. + */ + u64 start; /* start of the memory range */ + u64 end; /* end of the memory range (incl.) */ const char *type; /* type of the memory range */ struct list_head list; /* entry for the linked list */ struct kobject kobj; /* kobject for each entry */ @@ -101,7 +105,7 @@ static LIST_HEAD(map_entries); * Common implementation of firmware_map_add() and firmware_map_add_early() * which expects a pre-allocated struct firmware_map_entry. **/ -static int firmware_map_add_entry(resource_size_t start, resource_size_t end, +static int firmware_map_add_entry(u64 start, u64 end, const char *type, struct firmware_map_entry *entry) { @@ -132,8 +136,7 @@ static int firmware_map_add_entry(resource_size_t start, resource_size_t end, * * Returns 0 on success, or -ENOMEM if no memory could be allocated. **/ -int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type) +int firmware_map_add(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; @@ -157,8 +160,7 @@ int firmware_map_add(resource_size_t start, resource_size_t end, * * Returns 0 on success, or -ENOMEM if no memory could be allocated.
**/ -int __init firmware_map_add_early(resource_size_t start, resource_size_t end, - const char *type) +int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index cca686b..875451f 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h @@ -24,21 +24,17 @@ */ #ifdef CONFIG_FIRMWARE_MEMMAP -int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type); -int firmware_map_add_early(resource_size_t start, resource_size_t end, - const char *type); +int firmware_map_add(u64 start, u64 end, const char *type); +int firmware_map_add_early(u64 start, u64 end, const char *type); #else /* CONFIG_FIRMWARE_MEMMAP */ -static inline int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type) +static inline int firmware_map_add(u64 start, u64 end, const char *type) { return 0; } -static inline int firmware_map_add_early(resource_size_t start, - resource_size_t end, const char *type) +static inline int firmware_map_add_early(u64 start, u64 end, const char *type) { return 0; } -- cgit v0.10.2 From bb1f17b0372de93758653ca3454bc0df18dc2e5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jun 2009 15:31:18 -0700 Subject: mm: consolidate init_mm definition * create mm/init-mm.c, move init_mm there * remove INIT_MM, initialize init_mm with C99 initializer * unexport init_mm on all arches: init_mm is already unexported on x86. One strange place is some OMAP driver (drivers/video/omap/) which won't build modular, but it's already wants get_vm_area() export. Somebody should look there. [akpm@linux-foundation.org: add missing #includes] Signed-off-by: Alexey Dobriyan Cc: Mike Frysinger Cc: Americo Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c index c2938e5..19b8632 100644 --- a/arch/alpha/kernel/init_task.c +++ b/arch/alpha/kernel/init_task.c @@ -10,10 +10,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_mm); EXPORT_SYMBOL(init_task); union thread_union init_thread_union diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index e859af3..3f47086 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c index 993d56e..57ec9f2 100644 --- a/arch/avr32/kernel/init_task.c +++ b/arch/avr32/kernel/init_task.c @@ -15,10 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. Must be aligned on an 8192-byte boundary. 
*/ diff --git a/arch/blackfin/kernel/init_task.c b/arch/blackfin/kernel/init_task.c index 2c228c0..c26c34d 100644 --- a/arch/blackfin/kernel/init_task.c +++ b/arch/blackfin/kernel/init_task.c @@ -35,10 +35,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 4df0b32..51dcd04 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -38,10 +38,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c index 29429a8..1d3df1d 100644 --- a/arch/frv/kernel/init_task.c +++ b/arch/frv/kernel/init_task.c @@ -12,10 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c index cb5dc55..089c65e 100644 --- a/arch/h8300/kernel/init_task.c +++ b/arch/h8300/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index 5b0e830..c475fc2 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -19,10 +19,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c index 016885c..fce57e5 100644 --- a/arch/m32r/kernel/init_task.c +++ b/arch/m32r/kernel/init_task.c @@ -13,10 +13,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. 
* diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index ec37fb56..72bad65 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -42,10 +42,6 @@ */ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - union thread_union init_thread_union __attribute__((section(".data.init_task"), aligned(THREAD_SIZE))) = { INIT_THREAD_INFO(init_task) }; diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c index fe282de..45e97a2 100644 --- a/arch/m68knommu/kernel/init_task.c +++ b/arch/m68knommu/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index 149cd91..5b457a4 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -11,10 +11,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c index 5ac3566..80d423b 100644 --- a/arch/mn10300/kernel/init_task.c +++ b/arch/mn10300/kernel/init_task.c @@ -20,9 +20,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c index 1e25a45..82974b2 100644 --- a/arch/parisc/kernel/init_task.c +++ b/arch/parisc/kernel/init_task.c @@ -36,10 +36,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c index 688b329..ffc4253 100644 --- a/arch/powerpc/kernel/init_task.c +++ b/arch/powerpc/kernel/init_task.c @@ -9,10 +9,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c index 7db95c0..fe787f9 100644 --- a/arch/s390/kernel/init_task.c +++ b/arch/s390/kernel/init_task.c @@ -18,10 +18,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. 
* diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c index 80c35ff..1719957 100644 --- a/arch/sh/kernel/init_task.c +++ b/arch/sh/kernel/init_task.c @@ -10,9 +10,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct pt_regs fake_swapper_regs; -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index f28cb82..28125c5 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -10,10 +10,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_mm); EXPORT_SYMBOL(init_task); /* .text section in head.S is aligned at 8k boundary and this gets linked diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c index 806d381..b25121b 100644 --- a/arch/um/kernel/init_task.c +++ b/arch/um/kernel/init_task.c @@ -10,11 +10,8 @@ #include "linux/mqueue.h" #include "asm/uaccess.h" -struct mm_struct init_mm = INIT_MM(init_mm); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf26..270ff83 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -12,7 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); /* * Initial thread structure. 
diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c index e07f5c9..c4302f0 100644 --- a/arch/xtensa/kernel/init_task.c +++ b/arch/xtensa/kernel/init_task.c @@ -23,10 +23,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task) }; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 28b1f30..5368fbd 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,18 +15,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#define INIT_MM(name) \ -{ \ - .mm_rb = RB_ROOT, \ - .pgd = swapper_pg_dir, \ - .mm_users = ATOMIC_INIT(2), \ - .mm_count = ATOMIC_INIT(1), \ - .mmap_sem = __RWSEM_INITIALIZER(name.mmap_sem), \ - .page_table_lock = __SPIN_LOCK_UNLOCKED(name.page_table_lock), \ - .mmlist = LIST_HEAD_INIT(name.mmlist), \ - .cpu_vm_mask = CPU_MASK_ALL, \ -} - #define INIT_SIGNALS(sig) { \ .count = ATOMIC_INIT(1), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ diff --git a/mm/Makefile b/mm/Makefile index e89acb0..cf76be7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) +obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 0000000..57aba0d --- /dev/null +++ b/mm/init-mm.c @@ -0,0 +1,20 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +struct mm_struct init_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; -- cgit v0.10.2 From 1ebf26a9b338534def47f307c6c8694b6dfc0a79 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:19 -0700 Subject: readahead: make mmap_miss an unsigned int This makes the performance impact of possible mmap_miss wrap around to be temporary and tolerable: i.e. MMAP_LOTSAMISS=100 extra readarounds. Otherwise if ever mmap_miss wraps around to negative, it takes INT_MAX cache misses to bring it back to normal state. During the time mmap readaround will be _enabled_ for whatever wild random workload. That's almost permanent performance impact. 
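(Editor's illustration: a minimal userspace sketch of the wrap-around being fixed. MMAP_LOTSAMISS and the counter semantics come from mm/filemap.c; everything else here, including the test values, is assumed.)

	#include <limits.h>
	#include <stdio.h>

	#define MMAP_LOTSAMISS 100	/* same cutoff as mm/filemap.c */

	int main(void)
	{
		int s = INT_MAX;		/* old: int mmap_miss, at its limit */
		unsigned int u = UINT_MAX;	/* new: unsigned int mmap_miss */

		s++;	/* signed overflow: undefined, wraps to INT_MIN in practice */
		u++;	/* well-defined: wraps to 0 */

		/* after MMAP_LOTSAMISS + 1 further cache misses ... */
		s += MMAP_LOTSAMISS + 1;
		u += MMAP_LOTSAMISS + 1;

		/* ... the unsigned counter is already back above the cutoff,
		 * while the signed one stays negative for ~INT_MAX more misses */
		printf("signed:   %d -> readaround %s\n", s,
		       s > MMAP_LOTSAMISS ? "suppressed" : "still enabled");
		printf("unsigned: %u -> readaround %s\n", u,
		       u > MMAP_LOTSAMISS ? "suppressed" : "still enabled");
		return 0;
	}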
Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/fs.h b/include/linux/fs.h index ede84fa..8146e02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -879,7 +879,7 @@ struct file_ra_state { there are only # of pages ahead */ unsigned int ra_pages; /* Maximum readahead window */ - int mmap_miss; /* Cache miss stat for mmap accesses */ + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ loff_t prev_pos; /* Cache last read() position */ }; -- cgit v0.10.2 From f7e839dd36fd940b0202cfb7d39b2a1b2dc59b1b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:20 -0700 Subject: readahead: move max_sane_readahead() calls into force_page_cache_readahead() Impact: code simplification. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/fadvise.c b/mm/fadvise.c index 54a0f80..e433592 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = force_page_cache_readahead(mapping, file, start_index, - max_sane_readahead(nrpages)); + nrpages); if (ret > 0) ret = 0; break; diff --git a/mm/filemap.c b/mm/filemap.c index 1b60f30..dcef9fd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1390,8 +1390,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574..e994dcb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - force_page_cache_readahead(file->f_mapping, - file, start, max_sane_readahead(end - start)); + force_page_cache_readahead(file->f_mapping, file, start, end - start); return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index 133b6d5..a224182 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -210,6 +210,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; + nr_to_read = max_sane_readahead(nr_to_read); while (nr_to_read) { int err; -- cgit v0.10.2 From fc31d16add13773265cc53d59f2e7594cb3c0a14 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:21 -0700 Subject: readahead: apply max_sane_readahead() limit in ondemand_readahead() Just in case someone aggressively sets a huge readahead size. 
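(Editor's note, with assumed numbers for scale: ra->ra_pages is derived from a tunable such as /sys/block/<dev>/queue/read_ahead_kb, so a 1 GB setting would translate into a 262,144-page readahead request with 4K pages. With the mm/readahead.c hunk below, ondemand_readahead() computes max = max_sane_readahead(ra->ra_pages) up front, so such a request is clamped against available memory instead of being honoured verbatim.)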
Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/readahead.c b/mm/readahead.c index a224182..bd0f0e8 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -357,7 +357,7 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - int max = ra->ra_pages; /* max readahead pages */ + unsigned long max = max_sane_readahead(ra->ra_pages); pgoff_t prev_offset; int sequential; -- cgit v0.10.2 From caca7cb748571a5b39943a9b3e7081feef055e5e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:21 -0700 Subject: readahead: remove one unnecessary radix tree lookup (hit_readahead_marker != 0) means the page at @offset is present, so we can search for non-present page starting from @offset+1. Reported-by: Xu Chenfeng Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/readahead.c b/mm/readahead.c index bd0f0e8..5673115 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -395,7 +395,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); rcu_read_unlock(); if (!start || start - offset > max) -- cgit v0.10.2 From 160334a0cfa8e578b718f81038026326845d07d7 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:23 -0700 Subject: readahead: increase interleaved readahead size Make sure interleaved readahead size is larger than request size. This also makes the readahead window grow up more quickly. Reported-by: Xu Chenfeng Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/readahead.c b/mm/readahead.c index 5673115..16378b9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -403,6 +403,7 @@ ondemand_readahead(struct address_space *mapping, ra->start = start; ra->size = start - offset; /* old async_size */ + ra->size += req_size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; -- cgit v0.10.2 From 51daa88ebd8e0d437289f589af29d4b39379ea76 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:24 -0700 Subject: readahead: remove sync/async readahead call dependency The readahead call scheme is error-prone in that it expects the call sites to check for async readahead after doing a sync one. I.e. if (!page) page_cache_sync_readahead(); page = find_get_page(); if (page && PageReadahead(page)) page_cache_async_readahead(); This is because PG_readahead could be set by a sync readahead for the _current_ newly faulted in page, and the readahead code simply expects one more callback on the same page to start the async readahead. If the caller fails to do so, it will miss the PG_readahead bits and never able to start an async readahead. Eliminate this insane constraint by piggy-backing the async part into the current readahead window. Now if an async readahead should be started immediately after a sync one, the readahead logic itself will do it. 
So the following code becomes valid: (the 'else' in particular) if (!page) page_cache_sync_readahead(); else if (PageReadahead(page)) page_cache_async_readahead(); Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/readahead.c b/mm/readahead.c index 16378b9..d7c6e14 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -421,6 +421,16 @@ ondemand_readahead(struct address_space *mapping, ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: + /* + * Will this read hit the readahead marker made by itself? + * If so, trigger the readahead marker hit now, and merge + * the resulted next readahead window into the current one. + */ + if (offset == ra->start && ra->size == ra->async_size) { + ra->async_size = get_next_ra_size(ra, max); + ra->size += ra->async_size; + } + return ra_submit(ra, mapping, filp); } -- cgit v0.10.2 From ef00e08e26dd5d84271ef706262506b82195e752 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Jun 2009 15:31:25 -0700 Subject: readahead: clean up and simplify the code for filemap page fault readahead This shouldn't really change behavior all that much, but the single rather complex function with read-ahead inside a loop etc is broken up into more manageable pieces. The behaviour is also less subtle, with the read-ahead being done up-front rather than inside some subtle loop and thus avoiding the now unnecessary extra state variables (ie "did_readaround" is gone). Fengguang: the code split in fact fixed a bug reported by Pavel Levshin: the PGMAJFAULT accounting used to be bypassed when MADV_RANDOM is set, in which case the original code will directly jump to no_cached_page reading. Cc: Pavel Levshin Cc: Cc: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index dcef9fd..8275364 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1456,6 +1456,68 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma)) { + page_cache_sync_readahead(mapping, ra, file, offset, 1); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + pgoff_t start = 0; + + if (offset > ra_pages / 2) + start = offset - ra_pages / 2; + do_page_cache_readahead(mapping, file, start, ra_pages); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. 
+ */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, page, offset, 1); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1475,78 +1537,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; - int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + if (offset >= size) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1554,18 +1582,18 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. 
+ */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1574,7 +1602,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1594,12 +1622,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, -- cgit v0.10.2 From 70ac23cfa31f68289d4b720c6162b3929ab4de36 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:28 -0700 Subject: readahead: sequential mmap readahead Auto-detect sequential mmap reads and do readahead for them. The sequential mmap readahead will be triggered when - sync readahead: it's a major fault and (prev_offset == offset-1); - async readahead: minor fault on PG_readahead page with valid readahead state. The benefits of doing readahead instead of read-around: - less I/O wait thanks to async readahead - double real I/O size and no more cache hits The single stream case is improved a little. For 100,000 sequential mmap reads:

                                      user     system   cpu      total
(1-1) plain -mm, 128KB readaround:    3.224    2.554    48.40%   11.838
(1-2) plain -mm, 256KB readaround:    3.170    2.392    46.20%   11.976
(2)   patched -mm, 128KB readahead:   3.117    2.448    47.33%   11.607

The patched (2) has the smallest total time, since it has no cache hit overheads and less I/O block time (thanks to async readahead). Here the I/O size makes little difference, since there's only one single stream. Note that (1-1)'s real I/O size is 64KB and (1-2)'s real I/O size is 128KB, since half of the read-around pages will be readahead cache hits. This is going to make _real_ differences for _concurrent_ IO streams. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index 8275364..99977f0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1472,7 +1472,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_RandomReadHint(vma)) return; - if (VM_SequentialReadHint(vma)) { + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { page_cache_sync_readahead(mapping, ra, file, offset, 1); return; } -- cgit v0.10.2 From 2fad6f5deee5556f511eab58da78737a23ddb35d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:29 -0700 Subject: readahead: enforce full readahead size on async mmap readahead We need this in one particular case and two more general ones. Now we do async readahead for sequential mmap reads, and do it with the help of PG_readahead. For normal reads, PG_readahead is the sufficient condition to do a sequential readahead.
But unfortunately, for mmap reads, there is a tiny nuisance: [11736.998347] readahead-init0(process: sh/23926, file: sda1/w3m, offset=0:4503599627370495, ra=0+4-3) = 4 [11737.014985] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=290+32-0) = 17 [11737.019488] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=118+32-0) = 32 [11737.024921] readahead-interleaved(process: w3m/23926, file: sda1/w3m, offset=0:2, ra=4+6-6) = 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~ An unfavorably small readahead. The original dumb read-around size could be more efficient. That happened because ld-linux.so does a read(832) in L1 before mmap(), which triggers a 4-page readahead, with the second page tagged PG_readahead. L0: open("/lib/libc.so.6", O_RDONLY) = 3 L1: read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340\342"..., 832) = 832 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ L2: fstat(3, {st_mode=S_IFREG|0755, st_size=1420624, ...}) = 0 L3: mmap(NULL, 3527256, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fac6e51d000 L4: mprotect(0x7fac6e671000, 2097152, PROT_NONE) = 0 L5: mmap(0x7fac6e871000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x154000) = 0x7fac6e871000 L6: mmap(0x7fac6e876000, 16984, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fac6e876000 L7: close(3) = 0 In general, the PG_readahead flag will also be hit in cases - sequential reads - clustered random reads A full readahead size is desirable in both cases. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index 99977f0..5c0c651 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1516,7 +1516,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > 0) ra->mmap_miss--; if (PageReadahead(page)) - page_cache_async_readahead(mapping, ra, file, page, offset, 1); + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); } /** -- cgit v0.10.2 From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dumped by the way. Users will be pretty sensitive about the slow loading of executables. So it's unfavorable to disabled mmap read-around on a congested queue. 
[akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index ad613ed..33da7f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1178,8 +1178,6 @@ void task_dirty_inc(struct task_struct *tsk); #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read); @@ -1197,6 +1195,9 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); +unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, + struct file *filp); /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/mm/filemap.c b/mm/filemap.c index 5c0c651..734891d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1488,13 +1488,15 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > MMAP_LOTSAMISS) return; + /* + * mmap read-around + */ ra_pages = max_sane_readahead(ra->ra_pages); if (ra_pages) { - pgoff_t start = 0; - - if (offset > ra_pages / 2) - start = offset - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); } } diff --git a/mm/readahead.c b/mm/readahead.c index d7c6e14..a7f01fc 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -133,15 +133,12 @@ out: } /* - * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -232,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, } /* - * This version skips the IO if the queue is read-congested, and will tell the - * block layer to abandon the readahead if request allocation would block. - * - * force_page_cache_readahead() will ignore queue congestion and will block on - * request queues. - */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) -{ - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); -} - -/* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ @@ -260,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) /* * Submit IO for the read-ahead request in file_ra_state. 
*/ -static unsigned long ra_submit(struct file_ra_state *ra, +unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { int actual; -- cgit v0.10.2 From dc566127dd161b6c997466a2349ac179527ea89b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:32 -0700 Subject: radix-tree: add radix_tree_prev_hole() The counterpart of radix_tree_next_hole(). To be used by context readahead. Signed-off-by: Wu Fengguang Cc: Vladislav Bolkhovitin Cc: Jens Axboe Cc: Jeff Moyer Cc: Nick Piggin Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 355f6e8..c5da749 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items); unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 4bb42a0..5301a52 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -666,6 +666,43 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_next_hole); +/** + * radix_tree_prev_hole - find the prev hole (not-present entry) + * @root: tree root + * @index: index key + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] + * for the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= max_scan' + * will be true). In rare cases of wrap-around, LONG_MAX will be returned. + * + * radix_tree_prev_hole may be called under rcu_read_lock. However, like + * radix_tree_gang_lookup, this will not atomically search a snapshot of + * the tree at a single point in time. For example, if a hole is created + * at index 10, then subsequently a hole is created at index 5, + * radix_tree_prev_hole covering both indexes may return 5 if called under + * rcu_read_lock. + */ +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(root, index)) + break; + index--; + if (index == LONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(radix_tree_prev_hole); + static unsigned int __lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) -- cgit v0.10.2 From 045a2529a3513faed2d45bd82f9013b124309d94 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:33 -0700 Subject: readahead: move the random read case to bottom Split all readahead cases, and move the random one to bottom. No behavior changes. This is to prepare for the introduction of context readahead, and make it easy to insert accounting/tracing points for each case.
Signed-off-by: Wu Fengguang Cc: Vladislav Bolkhovitin Cc: Jens Axboe Cc: Jeff Moyer Cc: Nick Piggin Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/readahead.c b/mm/readahead.c index a7f01fc..ceed7e4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -339,33 +339,25 @@ ondemand_readahead(struct address_space *mapping, unsigned long req_size) { unsigned long max = max_sane_readahead(ra->ra_pages); - pgoff_t prev_offset; - int sequential; + + /* + * start of file + */ + if (!offset) + goto initial_readahead; /* * It's the expected callback offset, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if (offset && (offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((offset == (ra->start + ra->size - ra->async_size) || + offset == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; } - prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; - sequential = offset - prev_offset <= 1UL || req_size > max; - - /* - * Standalone, small read. - * Read as is, and do not pollute the readahead state. - */ - if (!hit_readahead_marker && !sequential) { - return __do_page_cache_readahead(mapping, filp, - offset, req_size, 0); - } - /* * Hit a marked page without valid readahead state. * E.g. interleaved reads. @@ -391,12 +383,24 @@ ondemand_readahead(struct address_space *mapping, } /* - * It may be one of - * - first read on start of file - * - sequential cache miss - * - oversize random read - * Start readahead for it. + * oversize read */ + if (req_size > max) + goto initial_readahead; + + /* + * sequential cache miss + */ + if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + goto initial_readahead; + + /* + * standalone, small random read + * Read as is, and do not pollute the readahead state. + */ + return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + +initial_readahead: ra->start = offset; ra->size = get_init_ra_size(req_size, max); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; -- cgit v0.10.2 From 10be0b372cac50e2e7a477852f98bf069a97a3fa Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:36 -0700 Subject: readahead: introduce context readahead algorithm Introduce page cache context based readahead algorithm. This is to better support concurrent read streams in general.

RATIONALE
---------
The current readahead algorithm detects interleaved reads in a _passive_ way. Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,... By checking for (offset == prev_offset + 1), it will discover the sequentialness between 3,4 and between 1004,1005, and start doing sequential readahead for the individual streams since page 4 and page 1005. The context readahead algorithm guarantees to discover the sequentialness no matter how the streams are interleaved. For the above example, it will start sequential readahead since page 2 and 1002. The trick is to poke for page @offset-1 in the page cache when it has no other clues on the sequentialness of request @offset: if the current request belongs to a sequential stream, that stream must have accessed page @offset-1 recently, and the page will still be cached now. So if page @offset-1 is there, we can take request @offset as a sequential access.

BENEFICIARIES
-------------
- strictly interleaved reads i.e. 1,1001,2,1002,3,1003,...
the current readahead will take them as silly random reads; the context readahead will take them as two sequential streams.
- cooperative IO processes i.e. NFS and SCST
They create a thread pool, farming off (sequential) IO requests to different threads which will be performing interleaved IO. It was not easy (or possible) to reliably tell from file->f_ra all those cooperative processes working on the same sequential stream, since they will have different file->f_ra instances. And NFSD's file->f_ra is particularly unusable, since their file objects are dynamically created for each request. The nfsd does have code trying to restore the f_ra bits, but not satisfactorily. The new scheme is to detect the sequential pattern via looking up the page cache, which provides one single and consistent view of the pages recently accessed. That makes sequential detection for cooperative processes possible.

USER REPORT
-----------
Vladislav recommends the addition of context readahead as a result of his SCST benchmarks. It leads to 6%~40% performance gains in various cases and achieves equal performance in others. http://lkml.org/lkml/2009/3/19/239

OVERHEADS
---------
In theory, it introduces one extra page cache lookup per random read. However the benchmark below shows context readahead to be slightly faster, surprisingly. Randomly reading 200MB of data on a sparse file, repeated 20 times for each block size, the average throughputs are:

                   original ra    context ra    gain
 4K random reads:   65.561MB/s    65.648MB/s   +0.1%
16K random reads:  124.767MB/s   124.951MB/s   +0.1%
64K random reads:  162.123MB/s   162.278MB/s   +0.1%

Cc: Jens Axboe Cc: Jeff Moyer Tested-by: Vladislav Bolkhovitin Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/readahead.c b/mm/readahead.c index ceed7e4..aa1aa23 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -330,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* + * Count contiguously cached pages from @offset-1 to @offset-@max, + * this count is a conservative estimation of + * - length of the sequential read sequence, or + * - thrashing threshold in memory tight systems + */ +static pgoff_t count_history_pages(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, unsigned long max) +{ + pgoff_t head; + + rcu_read_lock(); + head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + rcu_read_unlock(); + + return offset - 1 - head; +} + +/* + * page cache context based read-ahead + */ +static int try_context_readahead(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, + unsigned long req_size, + unsigned long max) +{ + pgoff_t size; + + size = count_history_pages(mapping, ra, offset, max); + + /* + * no history pages: + * it could be a random read + */ + if (!size) + return 0; + + /* + * starts from beginning of file: + * it is a strong indication of long-run stream (or whole-file-read) + */ + if (size >= offset) + size *= 2; + + ra->start = offset; + ra->size = get_init_ra_size(size + req_size, max); + ra->async_size = ra->size; + + return 1; +} + +/* * A minimal readahead algorithm for trivial sequential/random reads. */ static unsigned long @@ -395,6 +448,13 @@ ondemand_readahead(struct address_space *mapping, goto initial_readahead; /* + * Query the page cache and look for the traces(cached history pages) + * that a sequential stream would leave behind.
+ */ + if (try_context_readahead(mapping, ra, offset, req_size, max)) + goto readit; + + /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ -- cgit v0.10.2 From 61b7cbdba2f3c588a0cf3db574c562805454b09b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:36 -0700 Subject: readahead: remove redundant test in shrink_readahead_size_eio() Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index 734891d..2e9bcc2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } -- cgit v0.10.2 From 7ffc59b4d0bdfa00e882339f85b8a969bb7021e2 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:38 -0700 Subject: readahead: enforce full sync mmap readahead size Now that we do readahead for sequential mmap reads, here is a simple evaluation of the impacts, and one further optimization. It's an NFS-root debian desktop system, readahead size = 60 pages. The numbers are grabbed after a fresh boot into console.

approach   pgmajfault   RA miss ratio   mmap IO count   avg IO size(pages)
A          383          31.6%           383             11
B          225          32.4%           390             11
C          224          32.6%           307             13

case A: mmap sync/async readahead disabled
case B: mmap sync/async readahead enabled, with enforced full async readahead size
case C: mmap sync/async readahead enabled, with enforced full sync/async readahead size

or:
A = vanilla 2.6.30-rc1
B = A plus mmap readahead
C = B plus this patch

The numbers show that
- there are good possibilities for random mmap reads to trigger readahead
- 'pgmajfault' is reduced by 1/3, due to the _async_ nature of readahead
- case C can further reduce IO count by 1/4
- readahead miss ratios are not much affected

The theory is
- readahead is _good_ for clustered random reads, and can perform _better_ than readaround because it can be _async_.
- async readahead size is guaranteed to be larger than readaround size, and they are _async_, hence will mostly behave better
However for B
- sync readahead size could be smaller than readaround size, hence may make things worse by producing more, smaller IOs
which will be fixed by this patch.

Final conclusion:
- mmap readahead reduced major faults by 1/3, with no obvious overheads;
- mmap io can be further reduced by 1/4 with this patch.

Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index 2e9bcc2..6846a90 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1471,7 +1471,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_SequentialReadHint(vma) || offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { - page_cache_sync_readahead(mapping, ra, file, offset, 1); + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); return; } -- cgit v0.10.2 From d2bf6be8ab63aa84e6149aac934649aadf3828b1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:31:39 -0700 Subject: mm: clean up get_user_pages_fast() documentation Move more documentation for get_user_pages_fast into the new kerneldoc comment. Add some comments for get_user_pages as well. Also, move the get_user_pages_fast declaration up to get_user_pages. It wasn't there initially because it was once a static inline function.
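(Editor's illustration: a hedged usage sketch of the interface documented below. The function, buffer address and page count are made up and error handling is abbreviated; the calls themselves, get_user_pages_fast(), set_page_dirty_lock() and put_page(), follow the rules spelled out in the kerneldoc.)

	/* user_addr and the count of 16 pages are hypothetical */
	static int pin_and_touch(unsigned long user_addr)
	{
		struct page *pages[16];
		int i, nr;

		/* unlike get_user_pages(), no mmap_sem is taken by the caller;
		 * the call implicitly operates on current->mm */
		nr = get_user_pages_fast(user_addr, 16, 1, pages);
		if (nr <= 0)
			return nr ? nr : -EFAULT;

		for (i = 0; i < nr; i++) {
			/* ... DMA to the page, or access it via kmap() ... */
			set_page_dirty_lock(pages[i]);	/* we passed write == 1 */
			put_page(pages[i]);		/* release the pin */
		}
		return nr;
	}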
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Andy Grover Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 33da7f5..a880161 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -824,8 +824,11 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas); +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); @@ -850,19 +853,6 @@ extern int mprotect_fixup(struct vm_area_struct *vma, unsigned long end, unsigned long newflags); /* - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm (force=0 and doesn't return any vmas). - * - * get_user_pages_fast may take mmap_sem and page tables, so no assumptions - * can be made about locking. get_user_pages_fast is to be implemented in a - * way that is advantageous (vs get_user_pages()) when the user memory area is - * already faulted in and present in ptes. However if the pages have to be - * faulted in, it may turn out to be slightly slower). - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); - -/* * A callback you can register to apply pressure to ageable caches. * * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should diff --git a/mm/memory.c b/mm/memory.c index 4126dd1..891bad0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. 
That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) diff --git a/mm/util.c b/mm/util.c index abc65aa..d5d2213 100644 --- a/mm/util.c +++ b/mm/util.c @@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. + * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. */ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) -- cgit v0.10.2 From 78dc583d3ab43115579cb5f3f7bd12e3548dd5a5 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:31:40 -0700 Subject: vmscan: low order lumpy reclaim also should use PAGEOUT_IO_SYNC Commit 33c120ed2843090e2bd316de1588b8bf8b96cbde ("more aggressively use lumpy reclaim") increased how aggressive lumpy reclaim was by isolating both active and inactive pages for asynchronous lumpy reclaim on costly-high-order pages and for cheap-high-order when memory pressure is high. 
However, if the system is under heavy pressure and there are dirty pages, asynchronous IO may not be sufficient to reclaim a suitable page in time. This patch causes the caller to enter synchronous lumpy reclaim for costly-high-order pages and for cheap-high-order pages when under memory pressure. Minchan.kim@gmail.com said: Andy added synchronous lumpy reclaim with c661b078fd62abe06fd11fab4ac5e4eeafe26b6d. At that time, lumpy reclaim was not aggressive; his intention was just high-order users (above PAGE_ALLOC_COSTLY_ORDER). After some time, Rik added aggressive lumpy reclaim with 33c120ed2843090e2bd316de1588b8bf8b96cbde. His intention was to do lumpy reclaim when high-order users have trouble getting a small set of contiguous pages. So we also have to add synchronous pageout for small sets of contiguous pages. Cc: Lee Schermerhorn Cc: Andy Whitcroft Acked-by: Peter Zijlstra Cc: Rik van Riel Signed-off-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Reviewed-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 95c08a8..a6b7d14 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1061,6 +1061,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int lumpy_reclaim = 0; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_reclaim = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1073,19 +1086,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = ISOLATE_INACTIVE; - - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - mode = ISOLATE_BOTH; - else if (sc->order && priority < DEF_PRIORITY - 2) - mode = ISOLATE_BOTH; + int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, @@ -1122,7 +1123,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - sc->order > PAGE_ALLOC_COSTLY_ORDER) { + lumpy_reclaim) { congestion_wait(WRITE, HZ/10); /* -- cgit v0.10.2 From dcf975d58565880a134afb13bde511d1b873ce79 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 16 Jun 2009 15:31:44 -0700 Subject: mm/page-writeback.c: dirty limit type should be unsigned long get_dirty_limits() calls clip_bdi_dirty_limit() and task_dirty_limit() with the variable pbdi_dirty as one of the arguments. This variable is an unsigned long * but both functions expect it to be a long *.
This causes the following sparse warnings: warning: incorrect type in argument 3 (different signedness) expected long *pbdi_dirty got unsigned long *pbdi_dirty warning: incorrect type in argument 2 (different signedness) expected long *pdirty got unsigned long *pbdi_dirty Fix the warnings by changing the long * to unsigned long * in both functions. Signed-off-by: H Hartley Sweeten Cc: Johannes Weiner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bb553c3..7b0dcea 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, * This avoids exceeding the total dirty_limit when the floating averages * fluctuate too quickly. */ -static void -clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty, unsigned long *pbdi_dirty) { - long avail_dirty; + unsigned long avail_dirty; - avail_dirty = dirty - - (global_page_state(NR_FILE_DIRTY) + + avail_dirty = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP)); + global_page_state(NR_WRITEBACK_TEMP); - if (avail_dirty < 0) + if (avail_dirty < dirty) + avail_dirty = dirty - avail_dirty; + else avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + @@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * * dirty -= (dirty/8) * p_{t} */ -static void task_dirty_limit(struct task_struct *tsk, long *pdirty) +static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) { long numerator, denominator; - long dirty = *pdirty; + unsigned long dirty = *pdirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); -- cgit v0.10.2 From f3b39d47ebc51416fc3b690a32dfe030a2035e67 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:46 -0700 Subject: cpusets: restructure the function cpuset_update_task_memory_state() The kernel still allocates the page caches on the old nodes after a cpuset's mems has been modified when 'memory_spread_page' was set, or it didn't spread the page cache evenly over all the nodes that the faulting task is allowed to use after memory_spread_page was set. This is caused by the task's stale mems_allowed and flags: the current kernel doesn't update them unless some function invokes cpuset_update_task_memory_state(), which is sometimes too late. We must update the mems_allowed and the flags of the tasks in time. Slab has the same problem. The following patches fix this bug by updating tasks' mems_allowed and spread flags after their cpuset's mems or spread flags are changed. This patch: Extract a function from cpuset_update_task_memory_state().
It will be used later to update tasks' page/slab spread flags after their cpuset's flags are set. Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5a7e17..66b24d9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -331,6 +331,24 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); } +/* + * update task's spread flag if cpuset's page/slab spread flag is set + * + * Called with callback_mutex/cgroup_mutex held + */ +static void cpuset_update_task_spread_flag(struct cpuset *cs, + struct task_struct *tsk) +{ + if (is_spread_page(cs)) + tsk->flags |= PF_SPREAD_PAGE; + else + tsk->flags &= ~PF_SPREAD_PAGE; + if (is_spread_slab(cs)) + tsk->flags |= PF_SPREAD_SLAB; + else + tsk->flags &= ~PF_SPREAD_SLAB; +} + /** * cpuset_update_task_memory_state - update task memory placement * @@ -388,14 +406,7 @@ void cpuset_update_task_memory_state(void) cs = task_cs(tsk); /* Maybe changed when task not locked */ guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; - if (is_spread_page(cs)) - tsk->flags |= PF_SPREAD_PAGE; - else - tsk->flags &= ~PF_SPREAD_PAGE; - if (is_spread_slab(cs)) - tsk->flags |= PF_SPREAD_SLAB; - else - tsk->flags &= ~PF_SPREAD_SLAB; + cpuset_update_task_spread_flag(cs, tsk); task_unlock(tsk); mutex_unlock(&callback_mutex); mpol_rebind_task(tsk, &tsk->mems_allowed); -- cgit v0.10.2 From 950592f7b991f267d707d372b90f508bbe72acbc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:47 -0700 Subject: cpusets: update tasks' page/slab spread flags in time Fix the bug that the kernel didn't spread page cache/slab objects evenly over all the allowed nodes when spread flags were set, by updating tasks' page/slab spread flags in time. Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 66b24d9..af5a83d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -406,7 +406,6 @@ void cpuset_update_task_memory_state(void) cs = task_cs(tsk); /* Maybe changed when task not locked */ guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; - cpuset_update_task_spread_flag(cs, tsk); task_unlock(tsk); mutex_unlock(&callback_mutex); mpol_rebind_task(tsk, &tsk->mems_allowed); @@ -1204,6 +1203,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) } /* + * cpuset_change_flag - make a task's spread flags the same as its cpuset's + * @tsk: task to be updated + * @scan: struct cgroup_scanner containing the cgroup of the task + * + * Called by cgroup_scan_tasks() for each task in a cgroup. + * + * We don't need to re-check for the cgroup/cpuset membership, since we're + * holding cgroup_lock() at this point. + */ +static void cpuset_change_flag(struct task_struct *tsk, + struct cgroup_scanner *scan) +{ + cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); +} + +/* + * update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * + * Called with cgroup_mutex held + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + * + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. + */ +static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) +{ + struct cgroup_scanner scan; + + scan.cg = cs->css.cgroup; + scan.test_task = NULL; + scan.process_task = cpuset_change_flag; + scan.heap = heap; + cgroup_scan_tasks(&scan); +} + +/* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update @@ -1216,8 +1255,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; - int err; int balance_flag_changed; + int spread_flag_changed; + struct ptr_heap heap; + int err; trialcs = alloc_trial_cpuset(cs); if (!trialcs) @@ -1232,9 +1273,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) goto out; + err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (err < 0) + goto out; + balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); + spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) + || (is_spread_page(cs) != is_spread_page(trialcs))); + mutex_lock(&callback_mutex); cs->flags = trialcs->flags; mutex_unlock(&callback_mutex); @@ -1242,6 +1290,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); + if (spread_flag_changed) + update_tasks_flags(cs, &heap); + heap_free(&heap); out: free_trial_cpuset(trialcs); return err; @@ -1392,6 +1443,8 @@ static void cpuset_attach(struct cgroup_subsys *ss, if (err) return; + cpuset_update_task_spread_flag(cs, tsk); + from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); @@ -1453,11 +1506,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) break; case FILE_SPREAD_PAGE: retval = update_flag(CS_SPREAD_PAGE, cs, val); - cs->mems_generation = cpuset_mems_generation++; break; case FILE_SPREAD_SLAB: retval = update_flag(CS_SPREAD_SLAB, cs, val); - cs->mems_generation = cpuset_mems_generation++; break; default: retval = -EINVAL; -- cgit v0.10.2 From 58568d2a8215cb6f55caf2332017d7bdff954e1c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:49 -0700 Subject: cpuset,mm: update tasks' mems_allowed in time Fix allocation of page cache/slab objects on disallowed nodes when memory spread is set, by updating tasks' mems_allowed after their cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the memory policy code, because memory policy was originally applied only in the process's own context. After applying this patch, one task can directly manipulate another's mems_allowed, and we use alloc_lock in the task_struct to protect the task's mems_allowed and memory policy. But in the fast path we don't take a lock to protect them, because adding a lock may lead to performance regression. Without a lock, the task might see an empty nodemask when its cpuset's mems_allowed is changed to some non-overlapping set. In order to avoid this, we set all the new allowed nodes first, then clear the newly disallowed ones.
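To make the ordering concrete, here is a hypothetical interleaving (an illustration, not code from this patch) for a mems change from {0} to {1}:

	/*
	 * writer				lockless reader
	 * ------				---------------
	 * nodes_or(mems_allowed,
	 *	    mems_allowed, *newmems);	sees {0} or {0,1}
	 * mems_allowed = *newmems;		sees {0,1} or {1}
	 *
	 * At no point can the reader observe an empty nodemask, which a
	 * direct overwrite of two disjoint masks could not guarantee.
	 */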
[lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 05ea1dd..a5740fc 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,7 +18,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ -extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); @@ -27,7 +26,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p, extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); -void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); @@ -92,9 +90,13 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +static inline void set_mems_allowed(nodemask_t nodemask) +{ + current->mems_allowed = nodemask; +} + #else /* !CONFIG_CPUSETS */ -static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} @@ -116,7 +118,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} -static inline void cpuset_update_task_memory_state(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { @@ -188,6 +189,10 @@ static inline void cpuset_print_task_mems_allowed(struct task_struct *p) { } +static inline void set_mems_allowed(nodemask_t nodemask) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c900aa5..1048bf5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1318,7 +1318,8 @@ struct task_struct { /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; -/* Protection 
of (de-)allocation: mm, files, fs, tty, keyrings */ +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, + * mempolicy */ spinlock_t alloc_lock; #ifdef CONFIG_GENERIC_HARDIRQS @@ -1386,8 +1387,7 @@ struct task_struct { cputime_t acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS - nodemask_t mems_allowed; - int cpuset_mems_generation; + nodemask_t mems_allowed; /* Protected by alloc_lock */ int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS @@ -1410,7 +1410,7 @@ struct task_struct { struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA - struct mempolicy *mempolicy; + struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ diff --git a/init/main.c b/init/main.c index f6204f7..5e0d3f0 100644 --- a/init/main.c +++ b/init/main.c @@ -670,7 +670,6 @@ asmlinkage void __init start_kernel(void) initrd_start = 0; } #endif - cpuset_init_early(); page_cgroup_init(); enable_debug_pagealloc(); cpu_hotplug_init(); @@ -867,6 +866,11 @@ static noinline int init_post(void) static int __init kernel_init(void * unused) { lock_kernel(); + + /* + * init can allocate pages on any node + */ + set_mems_allowed(node_possible_map); /* * init can run on any cpu. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index af5a83d..7e75a41 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -97,12 +97,6 @@ struct cpuset { struct cpuset *parent; /* my parent */ - /* - * Copy of global cpuset_mems_generation as of the most - * recent time this cpuset changed its mems_allowed. - */ - int mems_generation; - struct fmeter fmeter; /* memory_pressure filter */ /* partition number for rebuild_sched_domains() */ @@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } -/* - * Increment this integer everytime any cpuset changes its - * mems_allowed value. Users of cpusets can track this generation - * number, and avoid having to lock and reload mems_allowed unless - * the cpuset they're using changes generation. - * - * A single, global generation is needed because cpuset_attach_task() could - * reattach a task to a different cpuset, which must not have its - * generation numbers aliased with those of that tasks previous cpuset. - * - * Generations are needed for mems_allowed because one task cannot - * modify another's memory placement. So we must enable every task, - * on every visit to __alloc_pages(), to efficiently check whether - * its current->cpuset->mems_allowed has changed, requiring an update - * of its current->mems_allowed. - * - * Since writes to cpuset_mems_generation are guarded by the cgroup lock - * there is no need to mark it atomic. - */ -static int cpuset_mems_generation; - static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), }; @@ -228,8 +201,9 @@ static struct cpuset top_cpuset = { * If a task is only holding callback_mutex, then it has read-only * access to cpusets. * - * The task_struct fields mems_allowed and mems_generation may only - * be accessed in the context of that task, so require no locks. + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. 
* * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word @@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, tsk->flags &= ~PF_SPREAD_SLAB; } -/** - * cpuset_update_task_memory_state - update task memory placement - * - * If the current tasks cpusets mems_allowed changed behind our - * backs, update current->mems_allowed, mems_generation and task NUMA - * mempolicy to the new value. - * - * Task mempolicy is updated by rebinding it relative to the - * current->cpuset if a task has its memory placement changed. - * Do not call this routine if in_interrupt(). - * - * Call without callback_mutex or task_lock() held. May be - * called with or without cgroup_mutex held. Thanks in part to - * 'the_top_cpuset_hack', the task's cpuset pointer will never - * be NULL. This routine also might acquire callback_mutex during - * call. - * - * Reading current->cpuset->mems_generation doesn't need task_lock - * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset using RCU. - * - * The rcu_dereference() is technically probably not needed, - * as I don't actually mind if I see a new cpuset pointer but - * an old value of mems_generation. However this really only - * matters on alpha systems using cpusets heavily. If I dropped - * that rcu_dereference(), it would save them a memory barrier. - * For all other arch's, rcu_dereference is a no-op anyway, and for - * alpha systems not using cpusets, another planned optimization, - * avoiding the rcu critical section for tasks in the root cpuset - * which is statically allocated, so can't vanish, will make this - * irrelevant. Better to use RCU as intended, than to engage in - * some cute trick to save a memory barrier that is impossible to - * test, for alpha systems using cpusets heavily, which might not - * even exist. - * - * This routine is needed to update the per-task mems_allowed data, - * within the tasks context, when it is trying to allocate memory - * (in various mm/mempolicy.c routines) and notices that some other - * task has been modifying its cpuset. - */ - -void cpuset_update_task_memory_state(void) -{ - int my_cpusets_mem_gen; - struct task_struct *tsk = current; - struct cpuset *cs; - - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - - if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - mutex_lock(&callback_mutex); - task_lock(tsk); - cs = task_cs(tsk); /* Maybe changed when task not locked */ - guarantee_online_mems(cs, &tsk->mems_allowed); - tsk->cpuset_mems_generation = cs->mems_generation; - task_unlock(tsk); - mutex_unlock(&callback_mutex); - mpol_rebind_task(tsk, &tsk->mems_allowed); - } -} - /* * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? * @@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that * migrating memory region. - * - * We call cpuset_update_task_memory_state() before hacking - * our tasks mems_allowed, so that we are assured of being in - * sync with our tasks cpuset, and in particular, callbacks to - * cpuset_update_task_memory_state() from nested page allocations - * won't see any mismatch of our cpuset and task mems_generation - * values, so won't overwrite our hacked tasks mems_allowed - * nodemask. 
*/ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, @@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, { struct task_struct *tsk = current; - cpuset_update_task_memory_state(); - - mutex_lock(&callback_mutex); tsk->mems_allowed = *to; - mutex_unlock(&callback_mutex); do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); - mutex_lock(&callback_mutex); guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); - mutex_unlock(&callback_mutex); } /* - * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new - * nodes if memory_migrate flag is set. Called with cgroup_mutex held. + * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy + * @tsk: the task to change + * @newmems: new nodes that the task will be set + * + * In order to avoid seeing no nodes if the old and new nodes are disjoint, + * we structure updates as setting all new allowed nodes, then clearing newly + * disallowed ones. + * + * Called with task's alloc_lock held + */ +static void cpuset_change_task_nodemask(struct task_struct *tsk, + nodemask_t *newmems) +{ + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, &tsk->mems_allowed); + mpol_rebind_task(tsk, newmems); + tsk->mems_allowed = *newmems; +} + +/* + * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy + * of it to cpuset's new mems_allowed, and migrate pages to new nodes if + * memory_migrate flag is set. Called with cgroup_mutex held. */ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; + nodemask_t newmems; + + cs = cgroup_cs(scan->cg); + guarantee_online_mems(cs, &newmems); + + task_lock(p); + cpuset_change_task_nodemask(p, &newmems); + task_unlock(p); mm = get_task_mm(p); if (!mm) return; - cs = cgroup_cs(scan->cg); migrate = is_memory_migrate(cs); mpol_rebind_mm(mm, &cs->mems_allowed); @@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, /* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the - * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies and if - * the cpuset is marked 'memory_migrate', migrate the tasks - * pages to the new memory. + * cpusets mems_allowed, and for each task in the cpuset, + * update mems_allowed and rebind task's mempolicy and any vma + * mempolicies and if the cpuset is marked 'memory_migrate', + * migrate the tasks pages to the new memory. * * Call with cgroup_mutex held. May take callback_mutex during call. 
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs, @@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; - cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); update_tasks_nodemask(cs, &oldmem, &heap); @@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss, if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); + to = node_possible_map; } else { - mutex_lock(&callback_mutex); guarantee_online_cpus(cs, cpus_attach); - mutex_unlock(&callback_mutex); + guarantee_online_mems(cs, &to); } err = set_cpus_allowed_ptr(tsk, cpus_attach); if (err) return; + task_lock(tsk); + cpuset_change_task_nodemask(tsk, &to); + task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); from = oldcs->mems_allowed; @@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create( struct cpuset *parent; if (!cont->parent) { - /* This is early initialization for the top cgroup */ - top_cpuset.mems_generation = cpuset_mems_generation++; return &top_cpuset.css; } parent = cgroup_cs(cont->parent); @@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create( return ERR_PTR(-ENOMEM); } - cpuset_update_task_memory_state(); cs->flags = 0; if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); @@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create( set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); - cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - cpuset_update_task_memory_state(); - if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = { .early_init = 1, }; -/* - * cpuset_init_early - just enough so that the calls to - * cpuset_update_task_memory_state() in early init code - * are harmless. 
- */ - -int __init cpuset_init_early(void) -{ - alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); - - top_cpuset.mems_generation = cpuset_mems_generation++; - return 0; -} - - /** * cpuset_init - initialize cpusets at system boot * @@ -1936,11 +1842,13 @@ int __init cpuset_init(void) { int err = 0; + if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) + BUG(); + cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); - top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; diff --git a/kernel/kthread.c b/kernel/kthread.c index 41c88fe..7fa4413 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +237,7 @@ int kthreadd(void *unused) ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_mems_allowed(node_possible_map); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6f..46bdf9d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -/* Create a new policy */ +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. But, we need to + * handle an empty nodemask with MPOL_PREFERRED here. + * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_semaphore for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +{ + nodemask_t cpuset_context_nmask; + int ret; + + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ + if (pol == NULL) + return 0; + + VM_BUG_ON(!nodes); + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) + nodes = NULL; /* explicit local allocation */ + else { + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = + cpuset_current_mems_allowed; + } + + ret = mpol_ops[pol->mode].create(pol, + nodes ? &cpuset_context_nmask : NULL); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; - nodemask_t cpuset_context_nmask; - int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? 
nodes_addr(*nodes)[0] : -1); @@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); - nodes = NULL; /* flag local alloc */ } } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy->mode = mode; policy->flags = flags; - if (nodes) { - /* - * cpuset related setup doesn't apply to local allocation - */ - cpuset_update_task_memory_state(); - if (flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); - else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); - if (mpol_store_user_nodemask(policy)) - policy->w.user_nodemask = *nodes; - else - policy->w.cpuset_mems_allowed = - cpuset_mems_allowed(current); - } - - ret = mpol_ops[mode].create(policy, - nodes ? &cpuset_context_nmask : NULL); - if (ret < 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(ret); - } return policy; } @@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) @@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { - struct mempolicy *new; + struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + int ret; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) @@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, */ if (mm) down_write(&mm->mmap_sem); - mpol_put(current->mempolicy); + task_lock(current); + ret = mpol_set_nodemask(new, nodes); + if (ret) { + task_unlock(current); + if (mm) + up_write(&mm->mmap_sem); + mpol_put(new); + return ret; + } + old = current->mempolicy; current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + task_unlock(current); if (mm) up_write(&mm->mmap_sem); + mpol_put(old); return 0; } /* * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { @@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; @@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ + task_lock(current); *nmask = cpuset_current_mems_allowed; + task_unlock(current); return 0; } @@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } err = 0; - if (nmask) + if (nmask) { + task_lock(current); get_policy_nodemask(pol, nmask); + task_unlock(current); + } out: mpol_cond_put(pol); @@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, return err; } down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask); + task_unlock(current); + if (err) { + 
up_write(&mm->mmap_sem); + mpol_put(new); + return err; + } vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - cpuset_update_task_memory_state(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; - if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; @@ -1854,6 +1894,8 @@ restart: */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { + int ret; + sp->root = RB_ROOT; /* empty tree == default mempolicy */ spin_lock_init(&sp->lock); @@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (IS_ERR(new)) + if (IS_ERR(new)) { + mpol_put(mpol); /* drop our ref on sb mpol */ return; /* no valid nodemask intersection */ + } + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + task_unlock(current); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (ret) { + mpol_put(new); + return; + } /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); @@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) err = 1; - else if (no_context) - new->w.user_nodemask = nodes; /* save for contextualization */ + else { + int ret; + + task_lock(current); + ret = mpol_set_nodemask(new, &nodes); + task_unlock(current); + if (ret) + err = 1; + else if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } + } out: /* Restore string for error message */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f53..7cc3179 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1569,10 +1569,7 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - /* - * The task's cpuset might have expanded its set of allowable nodes - */ - cpuset_update_task_memory_state(); + p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); -- cgit v0.10.2 From 6c0db4664b49417d80988953e69c323721353227 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 16 Jun 2009 15:31:50 -0700 Subject: mm: alloc_large_system_hash check order On an x86_64 with 4GB ram, tcp_init()'s call to alloc_large_system_hash(), to allocate tcp_hashinfo.ehash, is now triggering an mmotm WARN_ON_ONCE on order >= MAX_ORDER - it's hoping for order 11. alloc_large_system_hash() had better make its own check on the order. 
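For scale, an illustrative calculation (not from the patch): with 4K pages, the order-11 request above is 2^11 pages = 8MB in one block, while the largest buddy block is order MAX_ORDER-1 = 10, i.e. 4MB. The guarded call then simply skips the buddy allocator:

	unsigned long order = get_order(size);	/* get_order(8MB) == 11 */

	if (order < MAX_ORDER)	/* false here: valid orders are 0..10 */
		table = (void *)__get_free_pages(GFP_ATOMIC, order);
	/* table stays NULL and the function's existing retry logic can
	 * fall back to a smaller table instead of warning */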
Signed-off-by: Hugh Dickins Cc: David Miller Cc: Mel Gorman Cc: Eric Dumazet Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cc3179..cbed869 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4511,7 +4511,10 @@ void *__init alloc_large_system_hash(const char *tablename, table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { unsigned long order = get_order(size); - table = (void*) __get_free_pages(GFP_ATOMIC, order); + + if (order < MAX_ORDER) + table = (void *)__get_free_pages(GFP_ATOMIC, + order); /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table. -- cgit v0.10.2 From d239171e4f6efd58d7e423853056b1b6a74f1446 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:52 -0700 Subject: page allocator: replace __alloc_pages_internal() with __alloc_pages_nodemask() The start of a large patch series to clean up and optimise the page allocator. The performance improvements are in a wide range depending on the exact machine, but the results I've seen so far are approximately: kernbench: 0 to 0.12% (elapsed time) 0.49% to 3.20% (sys time) aim9: -4% to 30% (for page_test and brk_test) tbench: -1% to 4% hackbench: -2.5% to 3.45% (mostly within the noise though) netperf-udp -1.34% to 4.06% (varies between machines a bit) netperf-tcp -0.44% to 5.22% (varies between machines a bit) I don't have sysbench figures at hand, but previously they were within the -0.5% to 2% range. On netperf, the client and server were bound to opposite-numbered CPUs to maximise the problems with cache line bouncing of the struct pages, so I expect different people to report different results for netperf depending on their exact machine and how they ran the test (different machines, same cpus client/server, shared cache but two threads client/server, different socket client/server etc). I also measured the vmlinux sizes for a single x86-based config with CONFIG_DEBUG_INFO enabled but not CONFIG_DEBUG_VM. The core of the .config is based on the Debian Lenny kernel config so I expect it to be reasonably typical. This patch: __alloc_pages_internal is the core page allocator function, but essentially it is an alias of __alloc_pages_nodemask. Naming a publicly available and exported function "internal" is also a bit ugly. This patch renames __alloc_pages_internal() to __alloc_pages_nodemask() and deletes the old nodemask function. Warning - this patch renames an exported symbol. No in-kernel driver is affected, but external drivers calling __alloc_pages_internal() should change the call to __alloc_pages_nodemask() without any alteration of parameters.
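A hypothetical out-of-tree call site changes only the function name (an illustration, not from the patch):

	/* before */
	page = __alloc_pages_internal(gfp_mask, order, zonelist, NULL);

	/* after: same arguments, new name */
	page = __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);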
Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3760e7c..549ec55 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -172,24 +172,16 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { - return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); } -static inline struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); -} - - static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbed869..d58df90 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1458,7 +1458,7 @@ try_next_zone: * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1667,7 +1667,7 @@ nopage: got_pg: return page; } -EXPORT_SYMBOL(__alloc_pages_internal); +EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. -- cgit v0.10.2 From b3c466ce512923298ae8c0121d3e9f397a3f1210 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:53 -0700 Subject: page allocator: do not sanity check order in the fast path No user of the allocator API should be passing in an order >= MAX_ORDER but we check for it on each and every allocation. Delete this check and make it a VM_BUG_ON check further down the call path. 
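The visible effect, sketched with a hypothetical caller (not from the patch):

	/* before: every allocation paid for this test in the inline fast path */
	if (unlikely(order >= MAX_ORDER))
		return NULL;

	/* after: a bad order is caught deeper in the allocator instead */
	page = alloc_pages(GFP_KERNEL, MAX_ORDER);	/* warns once, returns NULL */

Well-behaved callers lose one branch per allocation; a buggy caller now gets a backtrace (once) instead of a silent NULL.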
[akpm@linux-foundation.org: s/VM_BUG_ON/WARN_ON_ONCE/] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 549ec55..c2d3fe0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -185,9 +185,6 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - /* Unknown node is current node */ if (nid < 0) nid = numa_node_id(); @@ -201,9 +198,6 @@ extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_page_vma(gfp_t gfp_mask, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d58df90..bfbd95c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1401,6 +1401,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, classzone_idx = zone_idx(preferred_zone); + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. -- cgit v0.10.2 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 56ceb68..fe63b2dc9 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1131,7 +1131,7 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp #ifdef CONFIG_NUMA { struct page *page; - page = alloc_pages_node(ioc->node == MAX_NUMNODES ? + page = alloc_pages_exact_node(ioc->node == MAX_NUMNODES ? 
numa_node_id() : ioc->node, flags, get_order(size)); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 8f33a88..5b17bd4 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1829,8 +1829,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = page_address(alloc_pages_node(numa_node_id(), - GFP_KERNEL, get_order(sz))); + data = __get_free_pages(GFP_KERNEL, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", cpu); diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 8eff8c1..6ba72ab 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d876423..98b6849 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -90,7 +90,8 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, */ node = pcibus_to_node(pdev->bus); if (likely(node >=0)) { - struct page *p = alloc_pages_node(node, flags, get_order(size)); + struct page *p = alloc_pages_exact_node(node, + flags, get_order(size)); if (likely(p)) cpuaddr = page_address(p); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 296b526..5e0a191 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -122,8 +122,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_node(area->nid, GFP_KERNEL | GFP_THISNODE, - area->order); + area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->order); if (!area->pages) { printk(KERN_WARNING "%s: no page on node %d\n", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32d6ae8..e770bf3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index bbefe77..3ce2920 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -302,7 +302,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr) pnode = uv_node_to_pnode(nid); if (bid < 0 || gru_base[bid]) continue; - page = alloc_pages_node(nid, GFP_KERNEL, order); + page = alloc_pages_exact_node(nid, GFP_KERNEL, order); if (!page) goto fail; gru_base[bid] = page_address(page); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 9172fcd..c76677a 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -232,7 +232,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, mq->mmr_blade = uv_cpu_to_blade_id(cpu); nid = cpu_to_node(cpu); - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 
pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c2d3fe0..4efa330 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -5,6 +5,7 @@ #include #include #include +#include struct vm_area_struct; @@ -192,6 +193,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } +static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, + unsigned int order) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + + return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); +} + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); diff --git a/include/linux/mm.h b/include/linux/mm.h index a880161..7b548e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 28cf26a..69911b5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -564,14 +564,14 @@ static int create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) diff --git a/mm/filemap.c b/mm/filemap.c index 6846a90..2239671 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c..2f8241f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. 
Move nid forward in the mask even diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 46bdf9d..e08e2c4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 068655d..5a24923 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } diff --git a/mm/slab.c b/mm/slab.c index 18e3164..bb3254c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1707,7 +1707,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; @@ -3261,7 +3261,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { diff --git a/mm/slob.c b/mm/slob.c index 12f2614..64f6db1 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); -- cgit v0.10.2 From 7f82af9742a9346794ecc1515139daed480e7025 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:56 -0700 Subject: page allocator: check only once if the zonelist is suitable for the allocation It is possible with __GFP_THISNODE that no zones are suitable. This patch makes sure the check is only made once. 
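The fix is a loop-invariant hoist: whether the zonelist is empty cannot change between allocation retries, so the test belongs before the restart label rather than after it. As a rough sketch of the pattern only (hypothetical helper names, not the kernel code):

#include <stddef.h>

struct zoneref { void *zone; };
struct zonelist { struct zoneref _zonerefs[1]; };

/* Stubs standing in for the real allocator machinery. */
static void *attempt_alloc(struct zonelist *zl) { (void)zl; return NULL; }
static int may_retry(void) { return 0; }

static void *alloc_checked_once(struct zonelist *zl)
{
	void *page;

	/* Invariant across retries: an empty zonelist stays empty. */
	if (zl->_zonerefs[0].zone == NULL)
		return NULL;
restart:
	page = attempt_alloc(zl);
	if (page == NULL && may_retry())
		goto restart;	/* the emptiness check is not repeated */
	return page;
}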
Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bfbd95c..6be8fcb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1483,9 +1483,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return NULL; -restart: - z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ - + /* the list of zones suitable for gfp_mask */ + z = zonelist->_zonerefs; if (unlikely(!z->zone)) { /* * Happens if we have an empty zonelist as a result of @@ -1494,6 +1493,7 @@ restart: return NULL; } +restart: page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) -- cgit v0.10.2 From 11e33f6a55ed7847d9c8ffe185ef87faf7806abe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:57 -0700 Subject: page allocator: break up the allocator entry point into fast and slow paths The core of the page allocator is one giant function which allocates memory on the stack and makes calculations that may not be needed for every allocation. This patch breaks up the allocator path into fast and slow paths for clarity. Note the slow paths are still inlined but the entry is marked unlikely. If they were not inlined, it actually increases text size to generate them, as there is only one call site. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6be8fcb..512bf9a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1457,47 +1457,171 @@ try_next_zone: return page; } -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline int +should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long pages_reclaimed) { - const gfp_t wait = gfp_mask & __GFP_WAIT; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zoneref *z; - struct zone *zone; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int do_retry; - int alloc_flags; - unsigned long did_some_progress; - unsigned long pages_reclaimed = 0; + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + return 0; - lockdep_trace_alloc(gfp_mask); + /* + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return 1; - might_sleep_if(wait); + /* + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. + */ + if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) + return 1; - if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + /* + * Don't let big-order allocations loop unless the caller + * explicitly requests that.
+ */ + if (gfp_mask & __GFP_NOFAIL) + return 1; - /* the list of zones suitable for gfp_mask */ - z = zonelist->_zonerefs; - if (unlikely(!z->zone)) { - /* - * Happens if we have an empty zonelist as a result of - * GFP_THISNODE being used on a memoryless node - */ + return 0; +} + +static inline struct page * +__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + struct page *page; + + /* Acquire the OOM killer lock for the zones in zonelist */ + if (!try_set_zone_oom(zonelist, gfp_mask)) { + schedule_timeout_uninterruptible(1); return NULL; } -restart: - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. + */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) - goto got_pg; + goto out; + + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order); + +out: + clear_zonelist_oom(zonelist, gfp_mask); + return page; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress) +{ + struct page *page = NULL; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (order != 0) + drain_all_pages(); + + if (likely(*did_some_progress)) + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, alloc_flags); + return page; +} + +static inline int +is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) +{ + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) + return 1; + return 0; +} + +/* + * This is called in the allocator slow-path if the allocation request is of + * sufficient urgency to ignore watermarks and take other desperate measures + */ +static inline struct page * +__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + struct page *page; + + do { + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + + if (!page && gfp_mask & __GFP_NOFAIL) + congestion_wait(WRITE, HZ/50); + } while (!page && (gfp_mask & __GFP_NOFAIL)); + + return page; +} + +static inline +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, + enum zone_type high_zoneidx) +{ + struct zoneref *z; + struct zone *zone; + + 
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); +} + +static inline struct page * +__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct page *page = NULL; + int alloc_flags; + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; + struct task_struct *p = current; /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1510,8 +1634,7 @@ restart: if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wake_all_kswapd(order, zonelist, high_zoneidx); /* * OK, we're below the kswapd watermark and have kicked background @@ -1531,6 +1654,7 @@ restart: if (wait) alloc_flags |= ALLOC_CPUSET; +restart: /* * Go through the zonelist again. Let __GFP_HIGH and allocations * coming from realtime tasks go deeper into reserves. @@ -1544,23 +1668,18 @@ restart: if (page) goto got_pg; - /* This allocation should allow future memory freeing. */ - rebalance: - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { + /* Allocate without watermarks if the context allows */ + if (is_allocation_high_priority(p, gfp_mask)) { + /* Do not dip into emergency reserves if specified */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { -nofail_alloc: - /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask); if (page) goto got_pg; - if (gfp_mask & __GFP_NOFAIL) { - congestion_wait(WRITE, HZ/50); - goto nofail_alloc; - } } + + /* Ensure no recursion into the allocator */ goto nopage; } @@ -1568,93 +1687,42 @@ nofail_alloc: if (!wait) goto nopage; - cond_resched(); - - /* We now go into synchronous reclaim */ - cpuset_memory_pressure_bump(); - - p->flags |= PF_MEMALLOC; - - lockdep_set_current_reclaim_state(gfp_mask); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - - did_some_progress = try_to_free_pages(zonelist, order, - gfp_mask, nodemask); - - p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, &did_some_progress); + if (page) + goto got_pg; - cond_resched(); + /* + * If we failed to make any progress reclaiming, then we are + * running out of options and have to consider going OOM + */ + if (!did_some_progress) { + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + page = __alloc_pages_may_oom(gfp_mask, order, + zonelist, high_zoneidx, + nodemask); + if (page) + goto got_pg; - if (order != 0) - drain_all_pages(); + /* + * The OOM killer does not trigger for high-order allocations + * but if no progress is being made, there are no other + * options and retrying is unlikely to help + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; - if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, alloc_flags); - if (page) - goto got_pg; - } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist, gfp_mask)) { - schedule_timeout_uninterruptible(1); goto 
restart; } - - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. - */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET); - if (page) { - clear_zonelist_oom(zonelist, gfp_mask); - goto got_pg; - } - - /* The OOM killer will not help higher order allocs so fail */ - if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist, gfp_mask); - goto nopage; - } - - out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist, gfp_mask); - goto restart; } - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. - * - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - * - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. - */ + /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - do_retry = 0; - if (!(gfp_mask & __GFP_NORETRY)) { - if (order <= PAGE_ALLOC_COSTLY_ORDER) { - do_retry = 1; - } else { - if (gfp_mask & __GFP_REPEAT && - pages_reclaimed < (1 << order)) - do_retry = 1; - } - if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { + if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + /* Wait for some write requests to complete then retry */ congestion_wait(WRITE, HZ/50); goto rebalance; } @@ -1669,6 +1737,41 @@ nopage: } got_pg: return page; + +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct page *page; + + lockdep_trace_alloc(gfp_mask); + + might_sleep_if(gfp_mask & __GFP_WAIT); + + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + + /* + * Check the zones suitable for the gfp_mask contain at least one + * valid zone. It's possible to have an empty zonelist as a result + * of GFP_THISNODE and a memoryless node + */ + if (unlikely(!zonelist->_zonerefs->zone)) + return NULL; + + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (unlikely(!page)) + page = __alloc_pages_slowpath(gfp_mask, order, + zonelist, high_zoneidx, nodemask); + + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); -- cgit v0.10.2 From 49255c619fbd482d704289b5eb2795f8e3b7ff2e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:58 -0700 Subject: page allocator: move check for disabled anti-fragmentation out of fastpath On low-memory systems, anti-fragmentation gets disabled as there is nothing it can do and it would just incur overhead shuffling pages between lists constantly. Currently the check is made in the free page fast path for every page. This patch moves it to a slow path. On machines with low memory, there will be a small amount of additional overhead as pages get shuffled between lists but it should quickly settle.
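The patch is a textbook case of moving a branch from a hot read path to a cold write path: get_pageblock_migratetype() runs on every page free, while set_pageblock_migratetype() runs only when a pageblock changes type, so the grouping-disabled check is folded into the stored value at write time. A compilable toy version of the trade-off, with hypothetical names in place of the kernel's pageblock bitmap:

enum migrate_type { MT_UNMOVABLE, MT_MOVABLE };

static int grouping_disabled;			/* decided once, e.g. at boot on small machines */
static enum migrate_type block_type[1024];	/* stand-in for the pageblock type storage */

/* Cold path: runs only when a block's type is (re)assigned. */
static void set_block_type(unsigned int idx, enum migrate_type mt)
{
	if (grouping_disabled)
		mt = MT_UNMOVABLE;	/* collapse every block into one bucket */
	block_type[idx] = mt;
}

/* Hot path: runs on every free; the branch is gone. */
static enum migrate_type get_block_type(unsigned int idx)
{
	return block_type[idx];
}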
Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a47c879..0aa4445 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -50,9 +50,6 @@ extern int page_group_by_mobility_disabled; static inline int get_pageblock_migratetype(struct page *page) { - if (unlikely(page_group_by_mobility_disabled)) - return MIGRATE_UNMOVABLE; - return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 512bf9a..b098596 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -168,6 +168,10 @@ int page_group_by_mobility_disabled __read_mostly; static void set_pageblock_migratetype(struct page *page, int migratetype) { + + if (unlikely(page_group_by_mobility_disabled)) + migratetype = MIGRATE_UNMOVABLE; + set_pageblock_flags_group(page, (unsigned long)migratetype, PB_migrate, PB_migrate_end); } -- cgit v0.10.2 From 5117f45d11a9ee62d9b086f1312f3f31781ff155 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:59 -0700 Subject: page allocator: calculate the preferred zone for allocation only once get_page_from_freelist() can be called multiple times for an allocation. Part of this calculates the preferred_zone which is the first usable zone in the zonelist but the zone depends on the GFP flags specified at the beginning of the allocation call. This patch calculates preferred_zone once. It's safe to do this because if preferred_zone is NULL at the start of the call, no amount of direct reclaim or other actions will change the fact the allocation will fail. [akpm@linux-foundation.org: remove (void) casts] Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b098596..8fc6d1f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1388,26 +1388,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) */ static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, - struct zonelist *zonelist, int high_zoneidx, int alloc_flags) + struct zonelist *zonelist, int high_zoneidx, int alloc_flags, + struct zone *preferred_zone) { struct zoneref *z; struct page *page = NULL; int classzone_idx; - struct zone *zone, *preferred_zone; + struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, - &preferred_zone); - if (!preferred_zone) - return NULL; - - classzone_idx = zone_idx(preferred_zone); - if (WARN_ON_ONCE(order >= MAX_ORDER)) return NULL; + classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. 
@@ -1500,7 +1495,7 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask) + nodemask_t *nodemask, struct zone *preferred_zone) { struct page *page; @@ -1517,7 +1512,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET); + ALLOC_WMARK_HIGH|ALLOC_CPUSET, + preferred_zone); if (page) goto out; @@ -1537,7 +1533,8 @@ out: static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress) + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + unsigned long *did_some_progress) { struct page *page = NULL; struct reclaim_state reclaim_state; @@ -1569,7 +1566,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (likely(*did_some_progress)) page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, alloc_flags); + zonelist, high_zoneidx, + alloc_flags, preferred_zone); return page; } @@ -1589,13 +1587,14 @@ is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask) + nodemask_t *nodemask, struct zone *preferred_zone) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, + preferred_zone); if (!page && gfp_mask & __GFP_NOFAIL) congestion_wait(WRITE, HZ/50); @@ -1618,7 +1617,7 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask) + nodemask_t *nodemask, struct zone *preferred_zone) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -1668,7 +1667,8 @@ restart: * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 
*/ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags); + high_zoneidx, alloc_flags, + preferred_zone); if (page) goto got_pg; @@ -1678,7 +1678,7 @@ rebalance: /* Do not dip into emergency reserves if specified */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask); + zonelist, high_zoneidx, nodemask, preferred_zone); if (page) goto got_pg; } @@ -1695,7 +1695,8 @@ rebalance: page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, nodemask, - alloc_flags, &did_some_progress); + alloc_flags, preferred_zone, + &did_some_progress); if (page) goto got_pg; @@ -1707,7 +1708,7 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, - nodemask); + nodemask, preferred_zone); if (page) goto got_pg; @@ -1752,6 +1753,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zone *preferred_zone; struct page *page; lockdep_trace_alloc(gfp_mask); @@ -1769,11 +1771,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* The preferred zone is used for statistics later */ + first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + if (!preferred_zone) + return NULL; + + /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + preferred_zone); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, - zonelist, high_zoneidx, nodemask); + zonelist, high_zoneidx, nodemask, + preferred_zone); return page; } -- cgit v0.10.2 From 3dd2826698b6902aafd9441ce28ebb44735fd0d6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:00 -0700 Subject: page allocator: calculate the migratetype for allocation only once GFP mask is converted into a migratetype when deciding which pagelist to take a page from. However, it is happening multiple times per allocation, at least once per zone traversed. Calculate it once. Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fc6d1f..d3be076 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1061,13 +1061,13 @@ void split_page(struct page *page, unsigned int order) * or two. 
*/ static struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags) + struct zone *zone, int order, gfp_t gfp_flags, + int migratetype) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); int cpu; - int migratetype = allocflags_to_migratetype(gfp_flags); again: cpu = get_cpu(); @@ -1389,7 +1389,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone) + struct zone *preferred_zone, int migratetype) { struct zoneref *z; struct page *page = NULL; @@ -1433,7 +1433,8 @@ zonelist_scan: } } - page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); + page = buffered_rmqueue(preferred_zone, zone, order, + gfp_mask, migratetype); if (page) break; this_zone_full: @@ -1495,7 +1496,8 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone) + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) { struct page *page; @@ -1513,7 +1515,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone); + preferred_zone, migratetype); if (page) goto out; @@ -1534,7 +1536,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; struct reclaim_state reclaim_state; @@ -1567,7 +1569,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (likely(*did_some_progress)) page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone); + alloc_flags, preferred_zone, + migratetype); return page; } @@ -1587,14 +1590,15 @@ is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone) + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone); + preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) congestion_wait(WRITE, HZ/50); @@ -1617,7 +1621,8 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone) + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -1668,7 +1673,8 @@ restart: */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone); + preferred_zone, + migratetype); if (page) goto got_pg; @@ -1678,7 
+1684,8 @@ rebalance: /* Do not dip into emergency reserves if specified */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask, preferred_zone); + zonelist, high_zoneidx, nodemask, preferred_zone, + migratetype); if (page) goto got_pg; } @@ -1696,7 +1703,7 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - &did_some_progress); + migratetype, &did_some_progress); if (page) goto got_pg; @@ -1708,7 +1715,8 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, - nodemask, preferred_zone); + nodemask, preferred_zone, + migratetype); if (page) goto got_pg; @@ -1755,6 +1763,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; struct page *page; + int migratetype = allocflags_to_migratetype(gfp_mask); lockdep_trace_alloc(gfp_mask); @@ -1779,11 +1788,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, - preferred_zone); + preferred_zone, migratetype); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone); + preferred_zone, migratetype); return page; } -- cgit v0.10.2 From 341ce06f69abfafa31b9468410a13dbd60e2b237 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Jun 2009 15:32:02 -0700 Subject: page allocator: calculate the alloc_flags for allocation only once Factor out the mapping between GFP and alloc_flags only once. Once factored out, it only needs to be calculated once but some care must be taken. [neilb@suse.de says] As the test: - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { - if (!(gfp_mask & __GFP_NOMEMALLOC)) { has been replaced with a slightly weaker one: + if (alloc_flags & ALLOC_NO_WATERMARKS) { Without care, this would allow recursion into the allocator via direct reclaim. This patch ensures we do not recurse when PF_MEMALLOC is set but TF_MEMDIE callers are now allowed to directly reclaim where they would have been prevented in the past. 
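Since gfp_mask is fixed for the lifetime of a single allocation, the GFP-to-alloc_flags mapping is a pure function of the mask plus the calling task's state, and can therefore be computed once ahead of the retry loop. A stripped-down userspace sketch of such a translator follows; the XALLOC_* values and the caller_state fields are invented for illustration and deliberately omit details such as TIF_MEMDIE:

#define XALLOC_WMARK_MIN	0x01
#define XALLOC_CPUSET		0x02
#define XALLOC_HIGH		0x04
#define XALLOC_HARDER		0x08
#define XALLOC_NO_WATERMARKS	0x10

struct caller_state {		/* stand-in for the relevant task_struct bits */
	int is_memalloc;	/* PF_MEMALLOC analogue */
	int is_rt;		/* realtime scheduling policy */
	int in_interrupt;
};

static int xgfp_to_alloc_flags(int can_wait, int wants_high, int no_memalloc,
			       const struct caller_state *c)
{
	int flags = XALLOC_WMARK_MIN | XALLOC_CPUSET;

	if (wants_high)
		flags |= XALLOC_HIGH;
	if (!can_wait) {
		/* atomic callers: dig deeper and skip cpuset enforcement */
		flags |= XALLOC_HARDER;
		flags &= ~XALLOC_CPUSET;
	} else if (c->is_rt) {
		flags |= XALLOC_HARDER;
	}
	if (!no_memalloc && !c->in_interrupt && c->is_memalloc)
		flags |= XALLOC_NO_WATERMARKS;

	return flags;
}

Note that the recursion guard the changelog worries about is restored separately in the patch body, by the added "if (p->flags & PF_MEMALLOC) goto nopage;" test before direct reclaim.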
Signed-off-by: Peter Zijlstra Acked-by: Pekka Enberg Signed-off-by: Mel Gorman Cc: Neil Brown Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d3be076..ef870fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1574,15 +1574,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return page; } -static inline int -is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) -{ - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) - return 1; - return 0; -} - /* * This is called in the allocator slow-path if the allocation request is of * sufficient urgency to ignore watermarks and take other desperate measures @@ -1618,6 +1609,42 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, wakeup_kswapd(zone, order); } +static inline int +gfp_to_alloc_flags(gfp_t gfp_mask) +{ + struct task_struct *p = current; + int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + const gfp_t wait = gfp_mask & __GFP_WAIT; + + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + */ + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + + if (!wait) { + alloc_flags |= ALLOC_HARDER; + /* + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(p))) + alloc_flags |= ALLOC_HARDER; + + if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { + if (!in_interrupt() && + ((p->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + alloc_flags |= ALLOC_NO_WATERMARKS; + } + + return alloc_flags; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -1648,56 +1675,35 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according * to how we want to proceed. - * - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or if the caller has realtime scheduling - * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags = ALLOC_WMARK_MIN; - if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) - alloc_flags |= ALLOC_HARDER; - if (gfp_mask & __GFP_HIGH) - alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags = gfp_to_alloc_flags(gfp_mask); restart: - /* - * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks go deeper into reserves. - * - * This is the last chance, in general, before the goto nopage. - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. - */ + /* This is the last chance, in general, before the goto nopage. 
*/ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags, - preferred_zone, - migratetype); + high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); if (page) goto got_pg; rebalance: /* Allocate without watermarks if the context allows */ - if (is_allocation_high_priority(p, gfp_mask)) { - /* Do not dip into emergency reserves if specified */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) { - page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); - if (page) - goto got_pg; - } - - /* Ensure no recursion into the allocator */ - goto nopage; + if (alloc_flags & ALLOC_NO_WATERMARKS) { + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask, + preferred_zone, migratetype); + if (page) + goto got_pg; } /* Atomic allocations - we can't balance anything */ if (!wait) goto nopage; + /* Avoid recursion of direct reclaim */ + if (p->flags & PF_MEMALLOC) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, -- cgit v0.10.2 From a56f57ff94c25d5d80def06f3ed8fe7f99147762 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:02 -0700 Subject: page allocator: remove a branch by assuming __GFP_HIGH == ALLOC_HIGH Allocations that specify __GFP_HIGH get the ALLOC_HIGH flag. If these flags are equal to each other, we can eliminate a branch. [akpm@linux-foundation.org: Suggested the hack] Signed-off-by: Mel Gorman Reviewed-by: Pekka Enberg Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef870fb..94f33e2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1616,14 +1616,16 @@ gfp_to_alloc_flags(gfp_t gfp_mask) int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; + /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ + BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - if (gfp_mask & __GFP_HIGH) - alloc_flags |= ALLOC_HIGH; + alloc_flags |= (gfp_mask & __GFP_HIGH); if (!wait) { alloc_flags |= ALLOC_HARDER; -- cgit v0.10.2 From 728ec980fb9fa2d65d9e05444079a53615985e7b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:04 -0700 Subject: page allocator: inline __rmqueue_smallest() Inline __rmqueue_smallest by altering flow very slightly so that there is only one call site. Because there is only one call-site, this function can then be inlined without causing text bloat. On an x86-based config, this patch reduces text by 16 bytes. 
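The goto here is structural rather than an optimisation in itself: it lets the MIGRATE_RESERVE retry reuse the one existing call to __rmqueue_smallest() instead of introducing a second call site, which would defeat the point of inlining a single-site function. The same control flow reduced to a compilable skeleton, with hypothetical names throughout:

#include <stddef.h>

struct xpage;

#define MT_RESERVE 3

/* Stubs standing in for __rmqueue_smallest()/__rmqueue_fallback(). */
static inline struct xpage *take_smallest(unsigned int order, int mt)
{
	(void)order; (void)mt;
	return NULL;
}

static struct xpage *take_fallback(unsigned int order, int mt)
{
	(void)order; (void)mt;
	return NULL;
}

static struct xpage *take(unsigned int order, int migratetype)
{
	struct xpage *page;

retry_reserve:
	page = take_smallest(order, migratetype);	/* the only call site */
	if (!page && migratetype != MT_RESERVE) {
		page = take_fallback(order, migratetype);
		if (!page) {
			migratetype = MT_RESERVE;
			goto retry_reserve;	/* loop back rather than call again */
		}
	}
	return page;
}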
Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94f33e2..04713f6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -661,7 +661,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, +static inline +struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { unsigned int current_order; @@ -831,8 +832,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, } } - /* Use MIGRATE_RESERVE rather than fail an allocation */ - return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); + return NULL; } /* @@ -844,11 +844,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) + if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { page = __rmqueue_fallback(zone, order, migratetype); + /* + * Use MIGRATE_RESERVE rather than fail an allocation. goto + * is used because __rmqueue_smallest is an inline function + * and we want just one call site + */ + if (!page) { + migratetype = MIGRATE_RESERVE; + goto retry_reserve; + } + } + return page; } -- cgit v0.10.2 From 0a15c3e9f649f71464ac39e6378f1fde6f995322 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:05 -0700 Subject: page allocator: inline buffered_rmqueue() buffered_rmqueue() is in the fast path so inline it. Because it only has one call site, this function can then be inlined without causing text bloat. On an x86-based config, it made no difference as the savings were padded out by NOP instructions. Mileage varies but text will either decrease in size or remain static. Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04713f6..c101921 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1072,7 +1072,8 @@ void split_page(struct page *page, unsigned int order) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page *buffered_rmqueue(struct zone *preferred_zone, +static inline +struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, int order, gfp_t gfp_flags, int migratetype) { -- cgit v0.10.2 From 0ac3a4099b0171ff965836182bc688bb8ca01058 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:06 -0700 Subject: page allocator: inline __rmqueue_fallback() __rmqueue_fallback() is in the slow path but has only one call site. Because there is only one call site, this function can then be inlined without causing text bloat. On an x86-based config, it made no difference as the savings were padded out by NOP instructions. Mileage varies but text will either decrease in size or remain static.
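One caveat behind the "mileage varies" hedge in these changelogs: for a static function with a single caller, gcc at -O2 will usually inline it and discard the out-of-line copy whether or not it is marked inline, so the annotation largely documents intent. A minimal illustration, entirely hypothetical and unrelated to the kernel sources:

/*
 * Build with: gcc -O2 -c example.c && nm example.o
 * Typically only api_entry survives as a symbol; with or without the
 * 'inline' keyword, a single-caller static function is folded into
 * its caller at this optimisation level.
 */
static inline long scale(long x)
{
	return x * 3 + 1;
}

long api_entry(long x)
{
	return scale(x);	/* the single call site */
}

For the kernel itself, text-size claims like the 16 bytes quoted above can be checked with size(1) on the affected object file, or by comparing two vmlinux builds with the tree's scripts/bloat-o-meter.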
Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c101921..91e29b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -771,8 +771,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, } /* Remove an element from the buddy allocator from the fallback list */ -static struct page *__rmqueue_fallback(struct zone *zone, int order, - int start_migratetype) +static inline struct page * +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area * area; int current_order; -- cgit v0.10.2 From ed0ae21dc5fe3b9ad4cf1c7bb2bfd2ad596c481c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:07 -0700 Subject: page allocator: do not call get_pageblock_migratetype() more than necessary get_pageblock_migratetype() is potentially called twice for every page free. Once, when being freed to the pcp lists and once when being freed back to buddy. When freeing from the pcp lists, it is known what the pageblock type was at the time of free so use it rather than rechecking. In low memory situations under memory pressure, this might skew anti-fragmentation slightly but the interference is minimal and decisions that are fragmenting memory are being made anyway. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 91e29b3..8f334d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -452,16 +452,18 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, - struct zone *zone, unsigned int order) + struct zone *zone, unsigned int order, + int migratetype) { unsigned long page_idx; int order_size = 1 << order; - int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) return; + VM_BUG_ON(migratetype == -1); + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); VM_BUG_ON(page_idx & (order_size - 1)); @@ -530,17 +532,18 @@ static void free_pages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, order); + __free_one_page(page, zone, order, page_private(page)); } spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order) +static void free_one_page(struct zone *zone, struct page *page, int order, + int migratetype) { spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __free_one_page(page, zone, order); + __free_one_page(page, zone, order, migratetype); spin_unlock(&zone->lock); } @@ -565,7 +568,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order); + free_one_page(page_zone(page), page, order, + get_pageblock_migratetype(page)); local_irq_restore(flags); } -- cgit v0.10.2 From da456f14d2f2d7350f2b9440af79c85a34c7eed5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:08 -0700 
Subject: page allocator: do not disable interrupts in free_page_mlock() free_page_mlock() tests and clears PG_mlocked using locked versions of the bit operations. If set, it disables interrupts to update counters and this happens on every page free even though interrupts are disabled very shortly afterwards a second time. This is wasteful. This patch splits what free_page_mlock() does. The bit check is still made. However, the update of counters is delayed until the interrupts are disabled and the non-lock version for clearing the bit is used. One potential weirdness with this split is that the counters do not get updated if the bad_page() check is triggered but a system showing bad pages is getting screwed already. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: Pekka Enberg Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Acked-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index 987bb03..58ec1bc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -157,14 +157,9 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) */ static inline void free_page_mlock(struct page *page) { - if (unlikely(TestClearPageMlocked(page))) { - unsigned long flags; - - local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); - local_irq_restore(flags); - } + __ClearPageMlocked(page); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); } #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f334d3..03a386d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -495,7 +495,6 @@ static inline void __free_one_page(struct page *page, static inline int free_pages_check(struct page *page) { - free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | @@ -552,6 +551,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; + int clearMlocked = PageMlocked(page); for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); @@ -567,6 +567,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); + if (unlikely(clearMlocked)) + free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order, get_pageblock_migratetype(page)); @@ -1013,6 +1015,7 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + int clearMlocked = PageMlocked(page); if (PageAnon(page)) page->mapping = NULL; @@ -1028,7 +1031,10 @@ static void free_hot_cold_page(struct page *page, int cold) pcp = &zone_pcp(zone, get_cpu())->pcp; local_irq_save(flags); + if (unlikely(clearMlocked)) + free_page_mlock(page); __count_vm_event(PGFREE); + if (cold) list_add_tail(&page->lru, &pcp->list); else -- cgit v0.10.2 From d395b73428d9748fb70b33477c9b2acae62f360a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:09 -0700 Subject: page allocator: do not setup zonelist cache when there is only one node There is a zonelist cache which is used to track zones that are not in the allowed cpuset or found to be recently full. This is to reduce cache footprint on large machines. On smaller machines, it just incurs cost for no gain. 
This patch only uses the zonelist cache when there are NUMA nodes. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a386d..fd8e3ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1464,8 +1464,11 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup) { - /* we do zlc_setup after the first zone is tried */ + if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + /* + * we do zlc_setup after the first zone is tried but only + * if there are multiple nodes make it worthwhile + */ allowednodes = zlc_setup(zonelist, alloc_flags); zlc_active = 1; did_zlc_setup = 1; -- cgit v0.10.2 From a3af9c389a7f3e675313f442fdd8c247c1cdb66b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:32:10 -0700 Subject: page allocator: do not check for compound pages during the page allocator sanity checks A number of sanity checks are made on each page allocation and free including that the page count is zero. page_count() checks for compound pages and checks the count of the head page if true. However, in these paths, we do not care if the page is compound or not as the count of each tail page should also be zero. This patch makes two changes to the use of page_count() in the free path. It converts one check of page_count() to a VM_BUG_ON() as the count should have been unconditionally checked earlier in the free path. It also avoids checking for compound pages. [mel@csn.ul.ie: Wrote changelog] Signed-off-by: Mel Gorman Signed-off-by: Nick Piggin Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd8e3ca..8485735 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -421,7 +421,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (PageBuddy(buddy) && page_order(buddy) == order) { - BUG_ON(page_count(buddy) != 0); + VM_BUG_ON(page_count(buddy) != 0); return 1; } return 0; @@ -497,7 +497,7 @@ static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); return 1; @@ -642,7 +642,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); return 1; -- cgit v0.10.2 From 418589663d6011de9006425b6c5721e1544fb47a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:12 -0700 Subject: page allocator: use allocation flags as an index to the zone watermark ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determine whether pages_min, pages_low or pages_high is used as the zone watermark when allocating the pages. Two branches in the allocator hotpath determine which watermark to use. This patch uses the flags as an array index into a watermark array that is indexed with WMARK_* defines accessed via helpers.
All call sites that use zone->pages_* are updated to use the helpers for accessing the values and the array offsets for setting. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6fab2dc..0ea5adb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used for page allocation or should be reclaimed. In this example, if normal pages (index=2) are required to this DMA zone and -pages_high is used for watermark, the kernel judges this zone should not be -used because pages_free(1355) is smaller than watermark + protection[2] +watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should +not be used because pages_free(1355) is smaller than watermark + protection[2] (4 + 2004 = 2008). If this protection value is 0, this zone would be used for normal page requirement. If requirement is DMA zone(index=0), protection[0] (=0) is used. @@ -280,9 +280,10 @@ The default value is 65536. min_free_kbytes: This is used to force the Linux VM to keep a minimum number -of kilobytes free. The VM uses this number to compute a pages_min -value for each lowmem zone in the system. Each lowmem zone gets -a number of reserved free pages based proportionally on its size. +of kilobytes free. The VM uses this number to compute a +watermark[WMARK_MIN] value for each lowmem zone in the system. +Each lowmem zone gets a number of reserved free pages based +proportionally on its size. Some minimal amount of memory is needed to satisfy PF_MEMALLOC allocations; if you set this to lower than 1024KB, your system will diff --git a/Documentation/vm/balance b/Documentation/vm/balance index bd3d31b..c46e68c 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance @@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would alleviate memory pressure on any zone in the page's node that has fallen below its watermark. -pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are -per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below pages_min, the hysteric field low_on_memory -gets set. This stays set till the number of free pages becomes pages_high. -When low_on_memory is set, page allocation requests will try to free some -pages in the zone (providing GFP_WAIT is set in the request). Orthogonal -to this, is the decision to poke kswapd to free some zone pages. That -decision is not hysteresis based, and is done when the number of free -pages is below pages_low; in which case zone_wake_kswapd is also set. +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +are per-zone fields, used to determine when a zone needs to be balanced. When +the number of pages falls below watermark[WMARK_MIN], the hysteric field +low_on_memory gets set. This stays set till the number of free pages becomes +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will +try to free some pages in the zone (providing GFP_WAIT is set in the request). +Orthogonal to this, is the decision to poke kswapd to free some zone pages. 
+That decision is not hysteresis based, and is done when the number of free +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. (Good) Ideas that I have heard: diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index 7daf897..b7a78ad 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -154,9 +154,9 @@ unsigned long __init zone_sizes_init(void) * Use all area of internal RAM. * see __alloc_pages() */ - NODE_DATA(1)->node_zones->pages_min = 0; - NODE_DATA(1)->node_zones->pages_low = 0; - NODE_DATA(1)->node_zones->pages_high = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_MIN] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_LOW] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_HIGH] = 0; return holes; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0aa4445..dd8487f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -163,6 +163,17 @@ static inline int is_unevictable_lru(enum lru_list l) #endif } +enum zone_watermarks { + WMARK_MIN, + WMARK_LOW, + WMARK_HIGH, + NR_WMARK +}; + +#define min_wmark_pages(z) (z->watermark[WMARK_MIN]) +#define low_wmark_pages(z) (z->watermark[WMARK_LOW]) +#define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) + struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ @@ -275,7 +286,10 @@ struct zone_reclaim_stat { struct zone { /* Fields commonly accessed by the page allocator */ - unsigned long pages_min, pages_low, pages_high; + + /* zone watermarks, access with *_wmark_pages(zone) macros */ + unsigned long watermark[NR_WMARK]; + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8485735..abe2600 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1150,10 +1150,15 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -1440,14 +1445,10 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; - if (alloc_flags & ALLOC_WMARK_MIN) - mark = zone->pages_min; - else if (alloc_flags & ALLOC_WMARK_LOW) - mark = zone->pages_low; - else - mark = zone->pages_high; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { if (!zone_reclaim_mode || @@ -1959,7 +1960,7 @@ static unsigned int nr_free_zone_pages(int offset) for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; + unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } @@ -2096,9 
+2097,9 @@ void show_free_areas(void) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), @@ -2702,8 +2703,8 @@ static inline unsigned long wait_table_bits(unsigned long size) /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on zone->pages_min. The memory within the - * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * of blocks reserved is based on min_wmark_pages(zone). The memory within + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ @@ -2716,7 +2717,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; - reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -4319,8 +4320,8 @@ static void calculate_totalreserve_pages(void) max = zone->lowmem_reserve[j]; } - /* we treat pages_high as reserved pages. */ - max += zone->pages_high; + /* we treat the high watermark as reserved pages. */ + max += high_wmark_pages(zone); if (max > zone->present_pages) max = zone->present_pages; @@ -4400,7 +4401,7 @@ void setup_per_zone_pages_min(void) * need highmem pages, so cap pages_min to a small * value here. * - * The (pages_high-pages_low) and (pages_low-pages_min) + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ @@ -4411,17 +4412,17 @@ void setup_per_zone_pages_min(void) min_pages = SWAP_CLUSTER_MAX; if (min_pages > 128) min_pages = 128; - zone->pages_min = min_pages; + zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = tmp; + zone->watermark[WMARK_MIN] = tmp; } - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -4566,7 +4567,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense + * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, diff --git a/mm/vmscan.c b/mm/vmscan.c index a6b7d14..e5245d0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1401,7 +1401,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. 
*/ - if (unlikely(file + free <= zone->pages_high)) { + if (unlikely(file + free <= high_wmark_pages(zone))) { percent[0] = 100; percent[1] = 0; return; @@ -1533,11 +1533,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1743,7 +1745,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1756,11 +1758,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1781,7 +1783,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1826,8 +1829,8 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1861,8 +1864,8 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; @@ -1871,8 +1874,8 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. 
*/ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -2038,7 +2041,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66db..4151107 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -714,9 +714,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), zone->pages_scanned, zone->lru[LRU_ACTIVE_ANON].nr_scan, zone->lru[LRU_INACTIVE_ANON].nr_scan, -- cgit v0.10.2 From f2260e6b1f4eba0f5b5906795117791b5c660154 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:13 -0700 Subject: page allocator: update NR_FREE_PAGES only as necessary When pages are being freed to the buddy allocator, the zone NR_FREE_PAGES counter must be updated. In the case of bulk per-cpu page freeing, it's updated once per page. This retouches cache lines more than necessary. Update the counters once per per-cpu bulk free. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index abe2600..d56e377 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -456,7 +456,6 @@ static inline void __free_one_page(struct page *page, int migratetype) { unsigned long page_idx; - int order_size = 1 << order; if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) @@ -466,10 +465,9 @@ static inline void __free_one_page(struct page *page, page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(page_idx & ((1 << order) - 1)); VM_BUG_ON(bad_range(zone, page)); - __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); while (order < MAX_ORDER-1) { unsigned long combined_idx; struct page *buddy; @@ -524,6 +522,8 @@ static void free_pages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); while (count--) { struct page *page; @@ -542,6 +542,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); spin_unlock(&zone->lock); } @@ -686,7 +688,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, list_del(&page->lru); rmv_page_order(page); area->nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); expand(zone, page, order, current_order, area, migratetype); return page; } @@ -826,8 +827,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) /* Remove the page from the freelists */
list_del(&page->lru); rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, - -(1UL << order)); if (current_order == pageblock_order) set_pageblock_migratetype(page, @@ -900,6 +899,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, set_page_private(page, migratetype); list = &page->lru; } + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); return i; } @@ -1129,6 +1129,7 @@ again: } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; -- cgit v0.10.2 From 974709bdb2a34db378fc84140220f363f558d0d6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:14 -0700 Subject: page allocator: get the pageblock migratetype without disabling interrupts Local interrupts are disabled when freeing pages to the PCP list. Part of that free path checks the migratetype of the pageblock the page is in, but it does so with interrupts disabled, and interrupts should never be disabled for longer than necessary. This patch checks the pagetype with interrupts enabled, with the result that a page may be freed to the wrong list when a pageblock changes type. As that block is now already considered mixed from an anti-fragmentation perspective, it's not of vital importance. Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d56e377..e60e414 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1030,6 +1030,7 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; + set_page_private(page, get_pageblock_migratetype(page)); local_irq_save(flags); if (unlikely(clearMlocked)) free_page_mlock(page); @@ -1039,7 +1040,6 @@ static void free_hot_cold_page(struct page *page, int cold) list_add_tail(&page->lru, &pcp->list); else list_add(&page->lru, &pcp->list); - set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); -- cgit v0.10.2 From 62bc62a873116805774ffd37d7f86aa4faa832b1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:15 -0700 Subject: page allocator: use a pre-calculated value instead of num_online_nodes() in fast paths num_online_nodes() is called in a number of places but most often by the page allocator when deciding whether the zonelist needs to be filtered based on cpusets or the zonelist cache. This is actually a heavy function and touches a number of cache lines. This patch stores the number of online nodes at boot time and updates the value when nodes get onlined and offlined. The value is then used in a number of important paths in place of num_online_nodes().
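The pattern behind nr_online_nodes is worth making explicit: when a derived value is read far more often than the state it is derived from changes, recompute it once at each state change rather than on every read. A minimal userspace sketch of the idea (illustrative only; the names model, but do not reproduce, the kernel's nodemask code):

/* Sketch of the caching pattern: popcount over a bitmap is the "heavy"
 * query; maintaining a plain int at state-change time makes the common
 * read path a single load. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long node_map;  /* one bit per online node */
static int cached_online;       /* maintained on state changes only */

static int popcount(unsigned long v)
{
	int n = 0;
	for (; v; v &= v - 1)   /* clear lowest set bit */
		n++;
	return n;
}

static void set_node_online(int nid, bool online)
{
	if (online)
		node_map |= 1UL << nid;
	else
		node_map &= ~(1UL << nid);
	cached_online = popcount(node_map);     /* slow path, but rare */
}

static int num_online(void)
{
	return cached_online;   /* fast path: no bitmap scan */
}

int main(void)
{
	set_node_online(0, true);
	set_node_online(3, true);
	printf("%d\n", num_online());   /* prints 2 */
	return 0;
}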
[rientjes@google.com: do not override definition of node_set_online() with macro] Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 848025c..829b94b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -408,6 +408,19 @@ static inline int num_node_state(enum node_states state) #define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) extern int nr_node_ids; +extern int nr_online_nodes; + +static inline void node_set_online(int nid) +{ + node_set_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} + +static inline void node_set_offline(int nid) +{ + node_clear_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} #else static inline int node_state(int node, enum node_states state) @@ -434,7 +447,10 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 +#define nr_online_nodes 1 +#define node_set_online(node) node_set_state((node), N_ONLINE) +#define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif #define node_online_map node_states[N_ONLINE] @@ -454,9 +470,6 @@ static inline int num_node_state(enum node_states state) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) -#define node_set_online(node) node_set_state((node), N_ONLINE) -#define node_set_offline(node) node_clear_state((node), N_ONLINE) - #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2f8241f..7b9b601 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -875,7 +875,7 @@ static void return_unused_surplus_pages(struct hstate *h, * can no longer free unreserved surplus pages. This occurs when * the nodes with surplus pages have no free pages. 
*/ - unsigned long remaining_iterations = num_online_nodes(); + unsigned long remaining_iterations = nr_online_nodes; /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -904,7 +904,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; nr_pages--; - remaining_iterations = num_online_nodes(); + remaining_iterations = nr_online_nodes; } } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e60e414..0c9f406 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -161,7 +161,9 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; @@ -1466,7 +1468,7 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup after the first zone is tried but only * if there are multiple nodes make it worthwhile @@ -2265,7 +2267,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2474,7 +2476,7 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); @@ -2625,7 +2627,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); diff --git a/mm/slub.c b/mm/slub.c index 30354bf..1c95077 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3737,7 +3737,7 @@ static int list_locations(struct kmem_cache *s, char *buf, to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 8847add..5ed8931 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -124,7 +124,7 @@ svc_pool_map_choose_mode(void) { unsigned int node; - if (num_online_nodes() > 1) { + if (nr_online_nodes > 1) { /* * Actually have multiple NUMA nodes, * so split pools on NUMA node boundaries -- cgit v0.10.2 From b6e68bc1baed9b6972a250aba66b8c5276cf6fb1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:16 -0700 Subject: page allocator: slab: use nr_online_nodes to check for a NUMA platform SLAB currently avoids checking a bitmap repeatedly by checking once and storing a flag. With the addition of nr_online_nodes as a cheaper version of num_online_nodes(), this check can be replaced by nr_online_nodes.
(Christoph did a patch that this is lifted almost verbatim from) Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index bb3254c..744ab9a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -898,7 +898,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; -static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -1457,10 +1456,8 @@ void __init kmem_cache_init(void) int order; int node; - if (num_possible_nodes() == 1) { + if (num_possible_nodes() == 1) use_alien_caches = 0; - numa_platform = 0; - } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -3590,7 +3587,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) * variable to skip the call, which is mostly likely to be present in * the cache. */ - if (numa_platform && cache_free_alien(cachep, objp)) + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { -- cgit v0.10.2 From 092cead6175bb1b3d3078a34ba71c939d526c70b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:17 -0700 Subject: page allocator: move free_page_mlock() to page_alloc.c Currently, free_page_mlock() is only called from page_alloc.c. Thus, we can move it to page_alloc.c. Cc: Lee Schermerhorn Cc: Mel Gorman Cc: Christoph Lameter Cc: Pekka Enberg Cc: Dave Hansen Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index 58ec1bc..4b1672a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -150,18 +150,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ - __ClearPageMlocked(page); - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); -} - #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { @@ -170,7 +158,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } -static inline void free_page_mlock(struct page *page) { } #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c9f406..5dac5d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -493,6 +493,22 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... 
+ */ +static inline void free_page_mlock(struct page *page) +{ + __ClearPageMlocked(page); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); +} +#else +static void free_page_mlock(struct page *page) { } +#endif + static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | -- cgit v0.10.2 From 72807a74c0172376bba6b5b27702c9f702b526e9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:18 -0700 Subject: page allocator: sanity check order in the page allocator slow path Callers may speculatively call different allocators in order of preference trying to allocate a buffer of a given size. The order needed to allocate this may be larger than what the page allocator can normally handle. While the allocator mostly does the right thing, it should not direct reclaim or wakeup kswapd with a bogus order. This patch sanity checks the order in the slow path and returns NULL if it is too large. Signed-off-by: Mel Gorman Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dac5d8..85759cd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1446,9 +1446,6 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) - return NULL; - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* @@ -1707,6 +1704,15 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct task_struct *p = current; /* + * In the slowpath, we sanity check order to avoid ever trying to + * reclaim >= MAX_ORDER areas which will never succeed. Callers may + * be using allocators in order of preference for an area that is + * too large. + */ + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + + /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and * __GFP_NOWARN set) should not cause reclaim since the subsystem * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim -- cgit v0.10.2 From a1dd268cf6306565a31a48deff8bf4f6b4b105f7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:19 -0700 Subject: mm: use alloc_pages_exact() in alloc_large_system_hash() to avoid duplicated logic alloc_large_system_hash() has logic for freeing pages at the end of an excessively large power-of-two buffer that is a duplicate of what is in alloc_pages_exact(). This patch converts alloc_large_system_hash() to use alloc_pages_exact(). Signed-off-by: Mel Gorman Acked-by: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 85759cd..8ca06d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4699,26 +4699,13 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order = get_order(size); - - if (order < MAX_ORDER) - table = (void *)__get_free_pages(GFP_ATOMIC, - order); /* * If bucketsize is not a power-of-two, we may free - * some pages at the end of hash table. 
+ * some pages at the end of hash table which + * alloc_pages_exact() automatically does */ - if (table) { - unsigned long alloc_end = (unsigned long)table + - (PAGE_SIZE << order); - unsigned long used = (unsigned long)table + - PAGE_ALIGN(size); - split_page(virt_to_page(table), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } - } + if (get_order(size) < MAX_ORDER) + table = alloc_pages_exact(size, GFP_ATOMIC); } } while (!table && size > PAGE_SIZE && --log2qty); -- cgit v0.10.2 From 20a0307c0396c2edb651401d2f2db193dda2f3c9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:22 -0700 Subject: mm: introduce PageHuge() for testing huge/gigantic pages A series of patches to enhance the /proc/pagemap interface and to add a userspace executable which can be used to present the pagemap data. Export 10 more flags to end users (and more for kernel developers): 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON hardware detected corruption 20. KPF_NOPAGE (pseudo flag) no page frame at the address (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. a simple demo of the page-types tool # ./page-types -h page-types [options] -r|--raw Raw mode, for kernel developers -a|--addr addr-spec Walk a range of pages -b|--bits bits-spec Walk pages with specified bits -l|--list Show page details in ranges -L|--list-each Show page details one by one -N|--no-summary Don't show summay info -h|--help Show this usage message addr-spec: N one page at offset N (unit: pages) N+M pages range from N to N+M-1 N,M pages range from N to M-1 N, pages range from N to end ,M pages range from 0 to M bits-spec: bit1,bit2 (flags & (bit1|bit2)) != 0 bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1 bit1,~bit2 (flags & (bit1|bit2)) == bit1 =bit1,bit2 flags == (bit1|bit2) bit-names: locked error referenced uptodate dirty lru active slab writeback reclaim buddy mmap anonymous swapcache swapbacked compound_head compound_tail huge unevictable hwpoison nopage reserved(r) mlocked(r) mappedtodisk(r) private(r) private_2(r) owner_private(r) arch(r) uncached(r) readahead(o) slob_free(o) slub_frozen(o) slub_debug(o) (r) raw mode bits (o) overloaded bits # ./page-types flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 487369 1903 _________________________________ 0x0000000000000014 5 0 __R_D____________________________ referenced,dirty 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000000000024 34 0 __R__l___________________________ referenced,lru 0x0000000000000028 3838 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 48 0 ___U_l_______________________I___ uptodate,lru,readahead 0x000000000000002c 6478 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x0000000000000040 8344 32 ______A__________________________ active 0x0000000000000060 1 0 _____lA__________________________ lru,active 0x0000000000000068 348 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x000000000000006c 988 3 
__RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 503 1 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 30 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types -r flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 468002 1828 _________________________________ 0x0000000100000000 19102 74 _____________________r___________ reserved 0x0000000000008000 41 0 _______________H_________________ compound_head 0x0000000000010000 188 0 ________________T________________ compound_tail 0x0000000000008014 1 0 __R_D__________H_________________ referenced,dirty,compound_head 0x0000000000010014 4 0 __R_D___________T________________ referenced,dirty,compound_tail 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000800000024 34 0 __R__l__________________P________ referenced,lru,private 0x0000000000000028 3794 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 46 0 ___U_l_______________________I___ uptodate,lru,readahead 0x0000000400000028 44 0 ___U_l_________________d_________ uptodate,lru,mappedtodisk 0x0001000400000028 2 0 ___U_l_________________d_____I___ uptodate,lru,mappedtodisk,readahead 0x000000000000002c 6434 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x000000040000002c 14 0 __RU_l_________________d_________ referenced,uptodate,lru,mappedtodisk 0x000000080000002c 30 0 __RU_l__________________P________ referenced,uptodate,lru,private 0x0000000800000040 8124 31 ______A_________________P________ active,private 0x0000000000000040 219 0 ______A__________________________ active 0x0000000800000060 1 0 _____lA_________________P________ lru,active,private 0x0000000000000068 322 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x0000000400000068 13 0 ___U_lA________________d_________ 
uptodate,lru,active,mappedtodisk 0x0000000800000068 12 0 ___U_lA_________________P________ uptodate,lru,active,private 0x000000000000006c 977 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x000000040000006c 5 0 __RU_lA________________d_________ referenced,uptodate,lru,active,mappedtodisk 0x000000080000006c 3 0 __RU_lA_________________P________ referenced,uptodate,lru,active,private 0x0000000c0000006c 3 0 __RU_lA________________dP________ referenced,uptodate,lru,active,mappedtodisk,private 0x0000000c00000068 1 0 ___U_lA________________dP________ uptodate,lru,active,mappedtodisk,private 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 538 2 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005008 2 0 ___U________a_b__________________ uptodate,anonymous,swapbacked 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x000000000000580c 1 0 __RU_______Ma_b__________________ referenced,uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 29 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types --raw --list --no-summary --bits reserved offset count flags 0 15 _____________________r___________ 31 4 _____________________r___________ 159 97 _____________________r___________ 4096 2067 _____________________r___________ 6752 2390 _____________________r___________ 9355 3 _____________________r___________ 9728 14526 _____________________r___________ This patch: Introduce PageHuge(), which identifies huge/gigantic pages by their dedicated compound destructor functions. Also move prep_compound_gigantic_page() to hugetlb.c and make __free_pages_ok() non-static. 
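A note on the identification trick: rather than consuming a scarce page flag, PageHuge() recognises hugeTLB pages by the destructor pointer stored in the compound head, comparing it against free_huge_page. A self-contained model of the idea (hypothetical struct and function names, not the kernel's types):

/* Model of "identify by destructor": the type test reduces to a
 * function-pointer comparison against the type-specific teardown. */
#include <stdio.h>

struct obj {
	void (*dtor)(struct obj *);
};

static void free_huge(struct obj *o)  { (void)o; /* hugeTLB-specific teardown */ }
static void free_plain(struct obj *o) { (void)o; /* generic compound teardown */ }

static int is_huge(const struct obj *o)
{
	return o->dtor == free_huge;    /* pointer compare, no flag bit spent */
}

int main(void)
{
	struct obj a = { .dtor = free_huge };
	struct obj b = { .dtor = free_plain };

	printf("%d %d\n", is_huge(&a), is_huge(&b));    /* prints: 1 0 */
	return 0;
}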
Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/page.c b/fs/proc/page.c index e998383..38dd88b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03be7f2..a05a5ef 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,8 @@ struct ctl_table; +int PageHuge(struct page *page); + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return vma->vm_flags & VM_HUGETLB; @@ -61,6 +63,11 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #else /* !CONFIG_HUGETLB_PAGE */ +static inline int PageHuge(struct page *page) +{ + return 0; +} + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b9b601..a56e6f3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. - */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. 
+ */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { diff --git a/mm/internal.h b/mm/internal.h index 4b1672a..b4ac332 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,9 +16,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -extern void prep_compound_page(struct page *page, unsigned long order); -extern void prep_compound_gigantic_page(struct page *page, unsigned long order); - static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); */ extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned long order); + /* * function for dealing with page's order in buddy system. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ca06d8..131655c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -300,23 +300,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; -- cgit v0.10.2 From ed7ce0f1022942301776f93159c981b09382ddea Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:23 -0700 Subject: proc: kpagecount/kpageflags code cleanup Move increments of pfn/out to bottom of the loop. 
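The motivation is more than cosmetic: pfn and out should only advance after put_user() has succeeded, so that on a partial read the cursors still refer to the entry that failed. A standalone userspace analogue of the loop shape (copy_out() is a hypothetical stand-in for put_user(); the per-entry value is dummy data):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for put_user(): fails on a NULL destination. */
static int copy_out(uint64_t v, uint64_t *dst)
{
	if (!dst)
		return -1;      /* models -EFAULT */
	*dst = v;
	return 0;
}

static long read_entries(uint64_t pfn, uint64_t *out, long count)
{
	long done = 0;

	while (count > 0) {
		uint64_t value = pfn * 2;       /* stand-in for per-pfn data */

		if (copy_out(value, out))
			break;  /* pfn/out still name the failed entry */

		pfn++;          /* increments live at the bottom: they   */
		out++;          /* only run after a successful copy      */
		count--;
		done++;
	}
	return done;
}

int main(void)
{
	uint64_t buf[4];

	printf("%ld entries\n", read_entries(100, buf, 4));     /* 4 entries */
	return 0;
}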
Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Acked-by: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/page.c b/fs/proc/page.c index 38dd88b..e73e911 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -12,6 +12,7 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -33,20 +34,22 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - ppage = NULL; if (pfn_valid(pfn)) ppage = pfn_to_page(pfn); - pfn++; + else + ppage = NULL; if (!ppage) pcount = 0; else pcount = page_mapcount(ppage); - if (put_user(pcount, out++)) { + if (put_user(pcount, out)) { ret = -EFAULT; break; } + pfn++; + out++; count -= KPMSIZE; } @@ -99,10 +102,10 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - ppage = NULL; if (pfn_valid(pfn)) ppage = pfn_to_page(pfn); - pfn++; + else + ppage = NULL; if (!ppage) kflags = 0; else @@ -120,11 +123,13 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); - if (put_user(uflags, out++)) { + if (put_user(uflags, out)) { ret = -EFAULT; break; } + pfn++; + out++; count -= KPMSIZE; } -- cgit v0.10.2 From 177975495914efb372f7edee28ba9a0fdb754149 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:24 -0700 Subject: proc: export more page flags in /proc/kpageflags Export all page flags faithfully in /proc/kpageflags. 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON(TBD) hardware detected corruption 20. KPF_NOPAGE (pseudo flag) no page frame at the address 32-39. more obscure flags for kernel developers (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. The accompanying page-types tool will handle the details like decoupling overloaded flags and hiding obscure flags to normal users. Thanks to KOSAKI and Andi for their valuable recommendations! 
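Decoupling the exported bit layout from the kernel-internal one reduces to moving a single bit at a time, ((kflags >> kbit) & 1) << ubit, which is what the new kpf_copy_bit() helper does. A minimal standalone illustration (the bit positions below are invented for the example):

/* Remap one bit from an internal position to an exported position. */
#include <stdint.h>
#include <stdio.h>

static inline uint64_t copy_bit(uint64_t k, int ubit, int kbit)
{
	return ((k >> kbit) & 1) << ubit;
}

int main(void)
{
	/* hypothetical mapping: internal bit 7 -> exported bit 1,
	 * internal bit 12 -> exported bit 2 */
	uint64_t internal = (1UL << 7) | (1UL << 12);
	uint64_t exported = 0;

	exported |= copy_bit(internal, 1, 7);
	exported |= copy_bit(internal, 2, 12);

	printf("0x%llx\n", (unsigned long long)exported);       /* 0x6 */
	return 0;
}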
Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/page.c b/fs/proc/page.c index e73e911..9d926bd 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -72,19 +72,124 @@ static const struct file_operations proc_kpagecount_operations = { /* These macros are used to decouple internal flags from exported ones */ -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 - -#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +/* 11-20: new additions in 2.6.31 */ +#define KPF_MMAP 11 +#define KPF_ANON 12 +#define KPF_SWAPCACHE 13 +#define KPF_SWAPBACKED 14 +#define KPF_COMPOUND_HEAD 15 +#define KPF_COMPOUND_TAIL 16 +#define KPF_HUGE 17 +#define KPF_UNEVICTABLE 18 +#define KPF_NOPAGE 20 + +/* kernel hacking assistances + * WARNING: subject to change, never rely on them! + */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} + +static u64 get_uflags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + /* + * Caveats on high order pages: + * PG_buddy will only be set on the head page; SLUB/SLQB do the same + * for PG_slab; SLOB won't set PG_slab at all on compound pages. 
+ */ + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + +#ifdef CONFIG_UNEVICTABLE_LRU + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#endif + +#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); + + return u; +}; static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -94,7 +199,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; - u64 kflags, uflags; pfn = src / KPMSIZE; count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); @@ -106,24 +210,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, ppage = pfn_to_page(pfn); else ppage = NULL; - if (!ppage) - kflags = 0; - else - kflags = ppage->flags; - - uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | - kpf_copy_bit(kflags, KPF_ERROR, PG_error) | - kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | - kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | - kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) | - kpf_copy_bit(kflags, KPF_LRU, PG_lru) | - kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) | - kpf_copy_bit(kflags, KPF_SLAB, PG_slab) | - kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) | - kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | - kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); - - if (put_user(uflags, out)) { + + if (put_user(get_uflags(ppage), out)) { ret = -EFAULT; break; } -- cgit v0.10.2 From c9ba78e226057a1c2f19671383c496df187c02b5 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:25 -0700 Subject: pagemap: document clarifications Some bit ranges were inclusive and some not. Fix them to be consistently inclusive. Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index ce72c0f..1f1e69f 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -12,9 +12,9 @@ There are three components to pagemap: value for each virtual page, containing the following data (from fs/proc/task_mmu.c, above pagemap_read): - * Bits 0-55 page frame number (PFN) if present + * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped - * Bits 5-55 swap offset if swapped + * Bits 5-54 swap offset if swapped * Bits 55-60 page shift (page size = 1<<page shift) Date: Tue, 16 Jun 2009 15:32:26 -0700 Subject: pagemap: document 9 more exported page flags Also add short descriptions for all of the 20 exported page flags.
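As a usage sketch of the documented interface: each /proc/kpageflags entry is a 64-bit flags word indexed by PFN, so a single pread() at offset pfn * 8 fetches the flags for one frame. A minimal reader that tests one documented bit (requires root; error handling abbreviated; the example PFN is arbitrary):

/* Read the flags word for one PFN from /proc/kpageflags and test the
 * documented BUDDY bit (bit 10). */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define KPF_BUDDY 10

int main(void)
{
	int fd = open("/proc/kpageflags", O_RDONLY);
	uint64_t flags;
	unsigned long pfn = 1000;       /* arbitrary example frame */

	if (fd < 0)
		return 1;
	if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) !=
	    (ssize_t)sizeof(flags)) {
		close(fd);
		return 1;
	}
	printf("pfn %lu: flags 0x%llx buddy=%d\n", pfn,
	       (unsigned long long)flags, (int)((flags >> KPF_BUDDY) & 1));
	close(fd);
	return 0;
}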
Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 1f1e69f..600a304 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -49,6 +49,68 @@ There are three components to pagemap: 8. WRITEBACK 9. RECLAIM 10. BUDDY + 11. MMAP + 12. ANON + 13. SWAPCACHE + 14. SWAPBACKED + 15. COMPOUND_HEAD + 16. COMPOUND_TAIL + 16. HUGE + 18. UNEVICTABLE + 20. NOPAGE + +Short descriptions to the page flags: + + 0. LOCKED + page is being locked for exclusive access, eg. by undergoing read/write IO + + 7. SLAB + page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator + When compound page is used, SLUB/SLQB will only set this flag on the head + page; SLOB will not flag it at all. + +10. BUDDY + a free memory block managed by the buddy system allocator + The buddy system organizes free memory in blocks of various orders. + An order N block has 2^N physically contiguous pages, with the BUDDY flag + set for and _only_ for the first page. + +15. COMPOUND_HEAD +16. COMPOUND_TAIL + A compound page with order N consists of 2^N physically contiguous pages. + A compound page with order 2 takes the form of "HTTT", where H donates its + head page and T donates its tail page(s). The major consumers of compound + pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc. + memory allocators and various device drivers. However in this interface, + only huge/giga pages are made visible to end users. +17. HUGE + this is an integral part of a HugeTLB page + +20. NOPAGE + no page frame exists at the requested address + + [IO related page flags] + 1. ERROR IO error occurred + 3. UPTODATE page has up-to-date data + ie. for file backed page: (in-memory data revision >= on-disk one) + 4. DIRTY page has been written to, hence contains new data + ie. for file backed page: (in-memory data revision > on-disk one) + 8. WRITEBACK page is being synced to disk + + [LRU related page flags] + 5. LRU page is in one of the LRU lists + 6. ACTIVE page is in the active LRU list +18. UNEVICTABLE page is in the unevictable (non-)LRU list + It is somehow pinned and not a candidate for LRU page reclaims, + eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments + 2. REFERENCED page has been referenced since last LRU list enqueue/requeue + 9. RECLAIM page will be reclaimed soon after its pageout IO completed +11. MMAP a memory mapped page +12. ANON a memory mapped page that is not part of a file +13. SWAPCACHE page is mapped to swap space, ie. has an associated swap entry +14. SWAPBACKED page is backed by swap/RAM + +The page-types tool in this directory can be used to query the above flags. Using pagemap to do something useful: -- cgit v0.10.2 From 35efa5e993a7a00a50b87d2b7725c3eafc80b083 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:27 -0700 Subject: pagemap: add page-types tool Add page-types, a handy tool for querying page flags. 
It will expand some of the overloaded flags: PG_slob_free = PG_private PG_slub_frozen = PG_active PG_slub_debug = PG_error PG_readahead = PG_reclaim and mask out obscure flags except in -raw mode: PG_reserved PG_mlocked PG_mappedtodisk PG_private PG_private_2 PG_owner_priv_1 PG_arch_1 PG_uncached PG_compound* for non hugeTLB pages [akpm@linux-foundation.org: fix warning] Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile index 6f562f7..27479d4 100644 --- a/Documentation/vm/Makefile +++ b/Documentation/vm/Makefile @@ -2,7 +2,7 @@ obj- := dummy.o # List of programs to build -hostprogs-y := slabinfo +hostprogs-y := slabinfo slqbinfo page-types # Tell kbuild to always build the programs always := $(hostprogs-y) diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c new file mode 100644 index 0000000..0833f44 --- /dev/null +++ b/Documentation/vm/page-types.c @@ -0,0 +1,698 @@ +/* + * page-types: Tool for querying page flags + * + * Copyright (C) 2009 Intel corporation + * Copyright (C) 2009 Wu Fengguang + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * kernel page flags + */ + +#define KPF_BYTES 8 +#define PROC_KPAGEFLAGS "/proc/kpageflags" + +/* copied from kpageflags_read() */ +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +/* [11-20] new additions in 2.6.31 */ +#define KPF_MMAP 11 +#define KPF_ANON 12 +#define KPF_SWAPCACHE 13 +#define KPF_SWAPBACKED 14 +#define KPF_COMPOUND_HEAD 15 +#define KPF_COMPOUND_TAIL 16 +#define KPF_HUGE 17 +#define KPF_UNEVICTABLE 18 +#define KPF_NOPAGE 20 + +/* [32-] kernel hacking assistances */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 + +/* [48-] take some arbitrary free slots for expanding overloaded flags + * not part of kernel API + */ +#define KPF_READAHEAD 48 +#define KPF_SLOB_FREE 49 +#define KPF_SLUB_FROZEN 50 +#define KPF_SLUB_DEBUG 51 + +#define KPF_ALL_BITS ((uint64_t)~0ULL) +#define KPF_HACKERS_BITS (0xffffULL << 32) +#define KPF_OVERLOADED_BITS (0xffffULL << 48) +#define BIT(name) (1ULL << KPF_##name) +#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) + +static char *page_flag_names[] = { + [KPF_LOCKED] = "L:locked", + [KPF_ERROR] = "E:error", + [KPF_REFERENCED] = "R:referenced", + [KPF_UPTODATE] = "U:uptodate", + [KPF_DIRTY] = "D:dirty", + [KPF_LRU] = "l:lru", + [KPF_ACTIVE] = "A:active", + [KPF_SLAB] = "S:slab", + [KPF_WRITEBACK] = "W:writeback", + [KPF_RECLAIM] = "I:reclaim", + [KPF_BUDDY] = "B:buddy", + + [KPF_MMAP] = "M:mmap", + [KPF_ANON] = "a:anonymous", + [KPF_SWAPCACHE] = "s:swapcache", + [KPF_SWAPBACKED] = "b:swapbacked", + [KPF_COMPOUND_HEAD] = "H:compound_head", + [KPF_COMPOUND_TAIL] = "T:compound_tail", + [KPF_HUGE] = "G:huge", + [KPF_UNEVICTABLE] = "u:unevictable", + [KPF_NOPAGE] = "n:nopage", + + [KPF_RESERVED] = "r:reserved", + [KPF_MLOCKED] = "m:mlocked", + [KPF_MAPPEDTODISK] = "d:mappedtodisk", + [KPF_PRIVATE] = "P:private", + [KPF_PRIVATE_2] = "p:private_2", + 
[KPF_OWNER_PRIVATE] = "O:owner_private", + [KPF_ARCH] = "h:arch", + [KPF_UNCACHED] = "c:uncached", + + [KPF_READAHEAD] = "I:readahead", + [KPF_SLOB_FREE] = "P:slob_free", + [KPF_SLUB_FROZEN] = "A:slub_frozen", + [KPF_SLUB_DEBUG] = "E:slub_debug", +}; + + +/* + * data structures + */ + +static int opt_raw; /* for kernel developers */ +static int opt_list; /* list pages (in ranges) */ +static int opt_no_summary; /* don't show summary */ +static pid_t opt_pid; /* process to walk */ + +#define MAX_ADDR_RANGES 1024 +static int nr_addr_ranges; +static unsigned long opt_offset[MAX_ADDR_RANGES]; +static unsigned long opt_size[MAX_ADDR_RANGES]; + +#define MAX_BIT_FILTERS 64 +static int nr_bit_filters; +static uint64_t opt_mask[MAX_BIT_FILTERS]; +static uint64_t opt_bits[MAX_BIT_FILTERS]; + +static int page_size; + +#define PAGES_BATCH (64 << 10) /* 64k pages */ +static int kpageflags_fd; +static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; + +#define HASH_SHIFT 13 +#define HASH_SIZE (1 << HASH_SHIFT) +#define HASH_MASK (HASH_SIZE - 1) +#define HASH_KEY(flags) (flags & HASH_MASK) + +static unsigned long total_pages; +static unsigned long nr_pages[HASH_SIZE]; +static uint64_t page_flags[HASH_SIZE]; + + +/* + * helper functions + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define min_t(type, x, y) ({ \ + type __min1 = (x); \ + type __min2 = (y); \ + __min1 < __min2 ? __min1 : __min2; }) + +unsigned long pages2mb(unsigned long pages) +{ + return (pages * page_size) >> 20; +} + +void fatal(const char *x, ...) +{ + va_list ap; + + va_start(ap, x); + vfprintf(stderr, x, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + + +/* + * page flag names + */ + +char *page_flag_name(uint64_t flags) +{ + static char buf[65]; + int present; + int i, j; + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + present = (flags >> i) & 1; + if (!page_flag_names[i]) { + if (present) + fatal("unkown flag bit %d\n", i); + continue; + } + buf[j++] = present ? 
page_flag_names[i][0] : '_'; + } + + return buf; +} + +char *page_flag_longname(uint64_t flags) +{ + static char buf[1024]; + int i, n; + + for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if ((flags >> i) & 1) + n += snprintf(buf + n, sizeof(buf) - n, "%s,", + page_flag_names[i] + 2); + } + if (n) + n--; + buf[n] = '\0'; + + return buf; +} + + +/* + * page list and summary + */ + +void show_page_range(unsigned long offset, uint64_t flags) +{ + static uint64_t flags0; + static unsigned long index; + static unsigned long count; + + if (flags == flags0 && offset == index + count) { + count++; + return; + } + + if (count) + printf("%lu\t%lu\t%s\n", + index, count, page_flag_name(flags0)); + + flags0 = flags; + index = offset; + count = 1; +} + +void show_page(unsigned long offset, uint64_t flags) +{ + printf("%lu\t%s\n", offset, page_flag_name(flags)); +} + +void show_summary(void) +{ + int i; + + printf(" flags\tpage-count MB" + " symbolic-flags\t\t\tlong-symbolic-flags\n"); + + for (i = 0; i < ARRAY_SIZE(nr_pages); i++) { + if (nr_pages[i]) + printf("0x%016llx\t%10lu %8lu %s\t%s\n", + (unsigned long long)page_flags[i], + nr_pages[i], + pages2mb(nr_pages[i]), + page_flag_name(page_flags[i]), + page_flag_longname(page_flags[i])); + } + + printf(" total\t%10lu %8lu\n", + total_pages, pages2mb(total_pages)); +} + + +/* + * page flag filters + */ + +int bit_mask_ok(uint64_t flags) +{ + int i; + + for (i = 0; i < nr_bit_filters; i++) { + if (opt_bits[i] == KPF_ALL_BITS) { + if ((flags & opt_mask[i]) == 0) + return 0; + } else { + if ((flags & opt_mask[i]) != opt_bits[i]) + return 0; + } + } + + return 1; +} + +uint64_t expand_overloaded_flags(uint64_t flags) +{ + /* SLOB/SLUB overload several page flags */ + if (flags & BIT(SLAB)) { + if (flags & BIT(PRIVATE)) + flags ^= BIT(PRIVATE) | BIT(SLOB_FREE); + if (flags & BIT(ACTIVE)) + flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN); + if (flags & BIT(ERROR)) + flags ^= BIT(ERROR) | BIT(SLUB_DEBUG); + } + + /* PG_reclaim is overloaded as PG_readahead in the read path */ + if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM)) + flags ^= BIT(RECLAIM) | BIT(READAHEAD); + + return flags; +} + +uint64_t well_known_flags(uint64_t flags) +{ + /* hide flags intended only for kernel hacker */ + flags &= ~KPF_HACKERS_BITS; + + /* hide non-hugeTLB compound pages */ + if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE))) + flags &= ~BITS_COMPOUND; + + return flags; +} + + +/* + * page frame walker + */ + +int hash_slot(uint64_t flags) +{ + int k = HASH_KEY(flags); + int i; + + /* Explicitly reserve slot 0 for flags 0: the following logic + * cannot distinguish an unoccupied slot from slot (flags==0). 
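+	 * Probing is linear: if HASH_KEY(flags) is already taken by
+	 * different flags, the loop below tries the next slot, wrapping
+	 * from the end of the table back to slot 1 (never back to 0).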
+ */ + if (flags == 0) + return 0; + + /* search through the remaining (HASH_SIZE-1) slots */ + for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) { + if (!k || k >= ARRAY_SIZE(page_flags)) + k = 1; + if (page_flags[k] == 0) { + page_flags[k] = flags; + return k; + } + if (page_flags[k] == flags) + return k; + } + + fatal("hash table full: bump up HASH_SHIFT?\n"); + exit(EXIT_FAILURE); +} + +void add_page(unsigned long offset, uint64_t flags) +{ + flags = expand_overloaded_flags(flags); + + if (!opt_raw) + flags = well_known_flags(flags); + + if (!bit_mask_ok(flags)) + return; + + if (opt_list == 1) + show_page_range(offset, flags); + else if (opt_list == 2) + show_page(offset, flags); + + nr_pages[hash_slot(flags)]++; + total_pages++; +} + +void walk_pfn(unsigned long index, unsigned long count) +{ + unsigned long batch; + unsigned long n; + unsigned long i; + + if (index > ULONG_MAX / KPF_BYTES) + fatal("index overflow: %lu\n", index); + + lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); + + while (count) { + batch = min_t(unsigned long, count, PAGES_BATCH); + n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); + if (n == 0) + break; + if (n < 0) { + perror(PROC_KPAGEFLAGS); + exit(EXIT_FAILURE); + } + + if (n % KPF_BYTES != 0) + fatal("partial read: %lu bytes\n", n); + n = n / KPF_BYTES; + + for (i = 0; i < n; i++) + add_page(index + i, kpageflags_buf[i]); + + index += batch; + count -= batch; + } +} + +void walk_addr_ranges(void) +{ + int i; + + kpageflags_fd = open(PROC_KPAGEFLAGS, O_RDONLY); + if (kpageflags_fd < 0) { + perror(PROC_KPAGEFLAGS); + exit(EXIT_FAILURE); + } + + if (!nr_addr_ranges) + walk_pfn(0, ULONG_MAX); + + for (i = 0; i < nr_addr_ranges; i++) + walk_pfn(opt_offset[i], opt_size[i]); + + close(kpageflags_fd); +} + + +/* + * user interface + */ + +const char *page_flag_type(uint64_t flag) +{ + if (flag & KPF_HACKERS_BITS) + return "(r)"; + if (flag & KPF_OVERLOADED_BITS) + return "(o)"; + return " "; +} + +void usage(void) +{ + int i, j; + + printf( +"page-types [options]\n" +" -r|--raw Raw mode, for kernel developers\n" +" -a|--addr addr-spec Walk a range of pages\n" +" -b|--bits bits-spec Walk pages with specified bits\n" +#if 0 /* planned features */ +" -p|--pid pid Walk process address space\n" +" -f|--file filename Walk file address space\n" +#endif +" -l|--list Show page details in ranges\n" +" -L|--list-each Show page details one by one\n" +" -N|--no-summary Don't show summay info\n" +" -h|--help Show this usage message\n" +"addr-spec:\n" +" N one page at offset N (unit: pages)\n" +" N+M pages range from N to N+M-1\n" +" N,M pages range from N to M-1\n" +" N, pages range from N to end\n" +" ,M pages range from 0 to M\n" +"bits-spec:\n" +" bit1,bit2 (flags & (bit1|bit2)) != 0\n" +" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" +" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" +" =bit1,bit2 flags == (bit1|bit2)\n" +"bit-names:\n" + ); + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + printf("%16s%s", page_flag_names[i] + 2, + page_flag_type(1ULL << i)); + if (++j > 3) { + j = 0; + putchar('\n'); + } + } + printf("\n " + "(r) raw mode bits (o) overloaded bits\n"); +} + +unsigned long long parse_number(const char *str) +{ + unsigned long long n; + + n = strtoll(str, NULL, 0); + + if (n == 0 && str[0] != '0') + fatal("invalid name or number: %s\n", str); + + return n; +} + +void parse_pid(const char *str) +{ + opt_pid = parse_number(str); +} + +void parse_file(const char *name) +{ +} + +void 
add_addr_range(unsigned long offset, unsigned long size) +{ + if (nr_addr_ranges >= MAX_ADDR_RANGES) + fatal("too much addr ranges\n"); + + opt_offset[nr_addr_ranges] = offset; + opt_size[nr_addr_ranges] = size; + nr_addr_ranges++; +} + +void parse_addr_range(const char *optarg) +{ + unsigned long offset; + unsigned long size; + char *p; + + p = strchr(optarg, ','); + if (!p) + p = strchr(optarg, '+'); + + if (p == optarg) { + offset = 0; + size = parse_number(p + 1); + } else if (p) { + offset = parse_number(optarg); + if (p[1] == '\0') + size = ULONG_MAX; + else { + size = parse_number(p + 1); + if (*p == ',') { + if (size < offset) + fatal("invalid range: %lu,%lu\n", + offset, size); + size -= offset; + } + } + } else { + offset = parse_number(optarg); + size = 1; + } + + add_addr_range(offset, size); +} + +void add_bits_filter(uint64_t mask, uint64_t bits) +{ + if (nr_bit_filters >= MAX_BIT_FILTERS) + fatal("too much bit filters\n"); + + opt_mask[nr_bit_filters] = mask; + opt_bits[nr_bit_filters] = bits; + nr_bit_filters++; +} + +uint64_t parse_flag_name(const char *str, int len) +{ + int i; + + if (!*str || !len) + return 0; + + if (len <= 8 && !strncmp(str, "compound", len)) + return BITS_COMPOUND; + + for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if (!strncmp(str, page_flag_names[i] + 2, len)) + return 1ULL << i; + } + + return parse_number(str); +} + +uint64_t parse_flag_names(const char *str, int all) +{ + const char *p = str; + uint64_t flags = 0; + + while (1) { + if (*p == ',' || *p == '=' || *p == '\0') { + if ((*str != '~') || (*str == '~' && all && *++str)) + flags |= parse_flag_name(str, p - str); + if (*p != ',') + break; + str = p + 1; + } + p++; + } + + return flags; +} + +void parse_bits_mask(const char *optarg) +{ + uint64_t mask; + uint64_t bits; + const char *p; + + p = strchr(optarg, '='); + if (p == optarg) { + mask = KPF_ALL_BITS; + bits = parse_flag_names(p + 1, 0); + } else if (p) { + mask = parse_flag_names(optarg, 0); + bits = parse_flag_names(p + 1, 0); + } else if (strchr(optarg, '~')) { + mask = parse_flag_names(optarg, 1); + bits = parse_flag_names(optarg, 0); + } else { + mask = parse_flag_names(optarg, 0); + bits = KPF_ALL_BITS; + } + + add_bits_filter(mask, bits); +} + + +struct option opts[] = { + { "raw" , 0, NULL, 'r' }, + { "pid" , 1, NULL, 'p' }, + { "file" , 1, NULL, 'f' }, + { "addr" , 1, NULL, 'a' }, + { "bits" , 1, NULL, 'b' }, + { "list" , 0, NULL, 'l' }, + { "list-each" , 0, NULL, 'L' }, + { "no-summary", 0, NULL, 'N' }, + { "help" , 0, NULL, 'h' }, + { NULL , 0, NULL, 0 } +}; + +int main(int argc, char *argv[]) +{ + int c; + + page_size = getpagesize(); + + while ((c = getopt_long(argc, argv, + "rp:f:a:b:lLNh", opts, NULL)) != -1) { + switch (c) { + case 'r': + opt_raw = 1; + break; + case 'p': + parse_pid(optarg); + break; + case 'f': + parse_file(optarg); + break; + case 'a': + parse_addr_range(optarg); + break; + case 'b': + parse_bits_mask(optarg); + break; + case 'l': + opt_list = 1; + break; + case 'L': + opt_list = 2; + break; + case 'N': + opt_no_summary = 1; + break; + case 'h': + usage(); + exit(0); + default: + usage(); + exit(1); + } + } + + if (opt_list == 1) + printf("offset\tcount\tflags\n"); + if (opt_list == 2) + printf("offset\tflags\n"); + + walk_addr_ranges(); + + if (opt_list == 1) + show_page_range(0, 0); /* drain the buffer */ + + if (opt_no_summary) + return 0; + + if (opt_list) + printf("\n\n"); + + show_summary(); + + return 0; +} -- cgit v0.10.2 From 
56e49d218890f49b0057710a4b6fef31f5ffbfec Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 16 Jun 2009 15:32:28 -0700 Subject: vmscan: evict use-once pages first When the file LRU lists are dominated by streaming IO pages, evict those pages first, before considering evicting other pages. This should be safe from deadlocks or performance problems because only three things can happen to an inactive file page: 1) referenced twice and promoted to the active list 2) evicted by the pageout code 3) under IO, after which it will get evicted or promoted The pages freed in this way can either be reused for streaming IO, or allocated for something else. If the pages are used for streaming IO, this pageout pattern continues. Otherwise, we will fall back to the normal pageout pattern. Signed-off-by: Rik van Riel Reported-by: Elladan Cc: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Lee Schermerhorn Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25b9ca9..45add35 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,6 +94,7 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru); @@ -239,6 +240,12 @@ mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 1; } +static inline int +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + return 1; +} + static inline unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78eb855..70db6e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 0; } +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + + return (active > inactive); +} + unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/vmscan.c b/mm/vmscan.c index e5245d0..9673437 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1351,12 +1351,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. 
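+ *
+ * Example: with 600 pages on ACTIVE_FILE and 400 on INACTIVE_FILE,
+ * active > inactive, so this returns true and shrink_list() refills
+ * the inactive list from the active one instead of reclaiming from
+ * the active list directly.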
+ */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } -- cgit v0.10.2 From 6e08a369ee10b361ac1cdcdf4fabd420fd08beb3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:29 -0700 Subject: vmscan: cleanup the scan batching code The vmscan batching logic is twisting. Move it into a standalone function nr_scan_try_batch() and document it. No behavior change. Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Cc: Christoph Lameter Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dd8487f..db976b9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -334,9 +334,9 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - struct { + struct zone_lru { struct list_head list; - unsigned long nr_scan; + unsigned long nr_saved_scan; /* accumulated for batching */ } lru[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 131655c..e5b8f62 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3657,7 +3657,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9673437..d4da097 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1492,6 +1492,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
@@ -1517,14 +1537,11 @@ static void shrink_zone(int priority, struct zone *zone, scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) { - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else + if (scanning_global_lru(sc)) + nr[l] = nr_scan_try_batch(scan, + &zone->lru[l].nr_saved_scan, + swap_cluster_max); + else nr[l] = scan; } @@ -2124,11 +2141,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; + if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); diff --git a/mm/vmstat.c b/mm/vmstat.c index 4151107..84c0555 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -718,10 +718,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_scan, - zone->lru[LRU_INACTIVE_ANON].nr_scan, - zone->lru[LRU_ACTIVE_FILE].nr_scan, - zone->lru[LRU_INACTIVE_FILE].nr_scan, + zone->lru[LRU_ACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_INACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_ACTIVE_FILE].nr_saved_scan, + zone->lru[LRU_INACTIVE_FILE].nr_saved_scan, zone->spanned_pages, zone->present_pages); -- cgit v0.10.2 From 08d9ae7cbbd0c5c07573d072ec771e997a9a39e0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:30 -0700 Subject: vmscan: don't export nr_saved_scan in /proc/zoneinfo The lru->nr_saved_scan's are not meaningful counters for even kernel developers. They typically are smaller than 32 and are always 0 for large lists. So remove them from /proc/zoneinfo. Hopefully this interface change won't break too many scripts. /proc/zoneinfo is too unstructured to be script friendly, and I wonder the affected scripts - if there are any - are still bleeding since the not long ago commit "vmscan: split LRU lists into anon & file sets", which also touched the "scanned" line :) If we are to re-export accumulated vmscan counts in the future, they can go to new lines in /proc/zoneinfo instead of the current form, or to /sys/devices/system/node/node0/meminfo? 
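The batching behaviour that keeps these counters tiny is easy to see in isolation. The sketch below lifts nr_scan_try_batch() verbatim from the patch above and drives it from a hypothetical userspace loop (batch size 32 standing in for SWAP_CLUSTER_MAX); nr_saved_scan never reaches the batch size between scans, which is why the exported values were uninteresting:

#include <stdio.h>

/* copied from the nr_scan_try_batch() introduced above */
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long swap_cluster_max)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= swap_cluster_max)
		*nr_saved_scan = 0;
	else
		nr = 0;

	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	unsigned long i;

	/* deposit small per-priority scan targets, as shrink_zone() does */
	for (i = 0; i < 8; i++)
		printf("deposit 10 -> scan %lu (saved %lu)\n",
		       nr_scan_try_batch(10, &saved, 32), saved);
	return 0;
}

Every fourth deposit crosses the threshold and releases a full batch; in between, the saved counter stays below 32.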
Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Acked-by: Christoph Lameter Cc: Peter Zijlstra Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmstat.c b/mm/vmstat.c index 84c0555..1e151cf 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -710,7 +710,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" + "\n scanned %lu" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), @@ -718,10 +718,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_saved_scan, - zone->lru[LRU_INACTIVE_ANON].nr_saved_scan, - zone->lru[LRU_ACTIVE_FILE].nr_saved_scan, - zone->lru[LRU_INACTIVE_FILE].nr_saved_scan, zone->spanned_pages, zone->present_pages); -- cgit v0.10.2 From af166777cf451f0373b952ce6766dc1c25385686 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:31 -0700 Subject: vmscan: ZVC updates in shrink_active_list() can be done once This effectively lifts the unit of updates to nr_inactive_* and pgdeactivate from PAGEVEC_SIZE=14 to SWAP_CLUSTER_MAX=32, or MAX_ORDER_NR_PAGES=1024 for reclaim_zone(). Cc: Johannes Weiner Acked-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index d4da097..7592d8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1223,7 +1223,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { unsigned long pgmoved; - int pgdeactivate = 0; unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); @@ -1252,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; + pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1286,7 +1285,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; - pgmoved = 0; + pgmoved = 0; /* count pages moved to inactive list */ while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1299,10 +1298,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, mem_cgroup_add_lru_list(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); spin_unlock_irq(&zone->lru_lock); - pgdeactivate += pgmoved; - pgmoved = 0; if (buffer_heads_over_limit) pagevec_strip(&pvec); __pagevec_release(&pvec); @@ -1310,9 +1306,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - pgdeactivate += pgmoved; __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgdeactivate); + __count_vm_events(PGDEACTIVATE, pgmoved); spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); -- cgit v0.10.2 From 5c87eada68fe5d29a5f67528f81b6e45124f579b Mon Sep 17 00:00:00 2001 From: 
Cyrill Gorcunov Date: Tue, 16 Jun 2009 15:32:32 -0700 Subject: mm: setup_per_zone_inactive_ratio - do not call for int_sqrt if not needed int_sqrt() returns 0 if its argument is zero so call it if only needed. Signed-off-by: Cyrill Gorcunov Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5b8f62..db8c46f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4471,8 +4471,9 @@ static void setup_per_zone_inactive_ratio(void) /* Zone size in gigabytes */ gb = zone->present_pages >> (30 - PAGE_SHIFT); - ratio = int_sqrt(10 * gb); - if (!ratio) + if (gb) + ratio = int_sqrt(10 * gb); + else ratio = 1; zone->inactive_ratio = ratio; -- cgit v0.10.2 From e9bb35df6f813ca46f8e6273add657643c7df73f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 16 Jun 2009 15:32:32 -0700 Subject: mm: setup_per_zone_inactive_ratio - fix comment and make it __init The caller of setup_per_zone_inactive_ratio is an __init function. There is no need to keep the callee after it completed as well. Also fix a comment. Acked-by: David Rientjes Signed-off-by: Cyrill Gorcunov Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db8c46f..076463c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4440,8 +4440,6 @@ void setup_per_zone_pages_min(void) } /** - * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. - * * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4462,7 +4460,7 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void setup_per_zone_inactive_ratio(void) +static void __init setup_per_zone_inactive_ratio(void) { struct zone *zone; -- cgit v0.10.2 From f8ad0f499fad5cdbcaaa2d97542b2db869b5a770 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:33 -0700 Subject: mm: introduce follow_pte() A generic readonly page table lookup helper to map an address space and an address from it to a pte. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index 891bad0..7135d6b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,6 +3103,43 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ +static int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. 
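+	 * (a huge pmd maps its whole range at the pmd level, so there
+	 * is no pte underneath it to hand back; bail out instead)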
*/ + if (pmd_huge(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + if (!ptep) + goto out; + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v0.10.2 From 03668a4debf4f50de55c34b6e66dae63e1c73716 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:34 -0700 Subject: mm: use generic follow_pte() in follow_phys() Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index 7135d6b..24ff204 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3145,50 +3145,24 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; + int ret = -EINVAL; pte_t *ptep, pte; spinlock_t *ptl; - resource_size_t phys_addr = 0; - struct mm_struct *mm = vma->vm_mm; - int ret = -EINVAL; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - - /* We cannot handle huge page PFN maps. Luckily they don't exist. */ - if (pmd_huge(*pmd)) + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!ptep) - goto out; - pte = *ptep; - if (!pte_present(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - phys_addr = pte_pfn(pte); - phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ *prot = pgprot_val(pte_pgprot(pte)); - *phys = phys_addr; - ret = 0; + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: -- cgit v0.10.2 From 3b6748e2dd69906af3835db4dc9d1c8a3ee4c68c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:35 -0700 Subject: mm: introduce follow_pfn() Analoguous to follow_phys(), add a helper that looks up the PFN at a user virtual address in an IO mapping or a raw PFN mapping. 
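A minimal sketch of a caller (lookup_user_pfn() is a hypothetical helper; the mmap_sem and find_vma() usage follows the videobuf conversion in the next patch):

/* sketch: resolve one user virtual address to a pfn */
static int lookup_user_pfn(struct mm_struct *mm, unsigned long uaddr,
			   unsigned long *pfn)
{
	struct vm_area_struct *vma;
	int ret = -EINVAL;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, uaddr);
	/* follow_pfn() itself rejects vmas without VM_IO|VM_PFNMAP */
	if (vma && vma->vm_start <= uaddr)
		ret = follow_pfn(vma, uaddr, pfn);
	up_read(&mm->mmap_sem);

	return ret;
}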
Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b548e7..c80dbd4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,6 +792,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 24ff204..d5d1653 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,6 +3140,35 @@ out: return -EINVAL; } +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v0.10.2 From 720b17e759a50635c429ccaa2ec3d01edb4f92d6 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 16 Jun 2009 15:32:36 -0700 Subject: videobuf-dma-contig: zero copy USERPTR support Since videobuf-dma-contig is designed to handle physically contiguous memory, this patch modifies the videobuf-dma-contig code to only accept a user space pointer to physically contiguous memory. For now only VM_PFNMAP vmas are supported, so forget hotplug. On SuperH Mobile we use this with our sh_mobile_ceu_camera driver together with various multimedia accelerator blocks that are exported to user space using UIO. The UIO kernel code exports physically contiguous memory to user space and lets the user space application mmap() this memory and pass a pointer using the USERPTR interface for V4L2 zero copy operation. With this approach we support zero copy capture, hardware scaling and various forms of hardware encoding and decoding. 
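On the userspace side, the zero-copy flow described above would look roughly like this sketch (fd, contig_ptr and buf_size are assumed to come from earlier V4L2 setup and a UIO mmap(), so the memory is physically contiguous and pfn-mapped):

#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* hypothetical helper: queue one USERPTR buffer backed by UIO memory */
static int queue_contig_userptr(int fd, void *contig_ptr, size_t buf_size)
{
	struct v4l2_requestbuffers req = {0};
	struct v4l2_buffer buf = {0};

	req.count  = 1;
	req.type   = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	req.memory = V4L2_MEMORY_USERPTR;
	if (ioctl(fd, VIDIOC_REQBUFS, &req) < 0)
		return -1;

	buf.index     = 0;
	buf.type      = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	buf.memory    = V4L2_MEMORY_USERPTR;
	buf.m.userptr = (unsigned long)contig_ptr;
	buf.length    = buf_size;

	/* the kernel side ends up in videobuf_dma_contig_user_get(),
	 * which walks the pfns and rejects non-contiguous memory */
	return ioctl(fd, VIDIOC_QBUF, &buf);
}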
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Magnus Damm Cc: Johannes Weiner Cc: Paul Mundt Acked-by: Mauro Carvalho Chehab Cc: Hans Verkuil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/media/video/videobuf-dma-contig.c b/drivers/media/video/videobuf-dma-contig.c index 6109fb5..0c29a01 100644 --- a/drivers/media/video/videobuf-dma-contig.c +++ b/drivers/media/video/videobuf-dma-contig.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ struct videobuf_dma_contig_memory { void *vaddr; dma_addr_t dma_handle; unsigned long size; + int is_userptr; }; #define MAGIC_DC_MEM 0x0733ac61 @@ -108,6 +110,82 @@ static struct vm_operations_struct videobuf_vm_ops = { .close = videobuf_vm_close, }; +/** + * videobuf_dma_contig_user_put() - reset pointer to user space buffer + * @mem: per-buffer private videobuf-dma-contig data + * + * This function resets the user space pointer + */ +static void videobuf_dma_contig_user_put(struct videobuf_dma_contig_memory *mem) +{ + mem->is_userptr = 0; + mem->dma_handle = 0; + mem->size = 0; +} + +/** + * videobuf_dma_contig_user_get() - setup user space memory pointer + * @mem: per-buffer private videobuf-dma-contig data + * @vb: video buffer to map + * + * This function validates and sets up a pointer to user space memory. + * Only physically contiguous pfn-mapped memory is accepted. + * + * Returns 0 if successful. + */ +static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem, + struct videobuf_buffer *vb) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long prev_pfn, this_pfn; + unsigned long pages_done, user_address; + int ret; + + mem->size = PAGE_ALIGN(vb->size); + mem->is_userptr = 0; + ret = -EINVAL; + + down_read(&mm->mmap_sem); + + vma = find_vma(mm, vb->baddr); + if (!vma) + goto out_up; + + if ((vb->baddr + mem->size) > vma->vm_end) + goto out_up; + + pages_done = 0; + prev_pfn = 0; /* kill warning */ + user_address = vb->baddr; + + while (pages_done < (mem->size >> PAGE_SHIFT)) { + ret = follow_pfn(vma, user_address, &this_pfn); + if (ret) + break; + + if (pages_done == 0) + mem->dma_handle = this_pfn << PAGE_SHIFT; + else if (this_pfn != (prev_pfn + 1)) + ret = -EFAULT; + + if (ret) + break; + + prev_pfn = this_pfn; + user_address += PAGE_SIZE; + pages_done++; + } + + if (!ret) + mem->is_userptr = 1; + + out_up: + up_read(¤t->mm->mmap_sem); + + return ret; +} + static void *__videobuf_alloc(size_t size) { struct videobuf_dma_contig_memory *mem; @@ -154,12 +232,11 @@ static int __videobuf_iolock(struct videobuf_queue *q, case V4L2_MEMORY_USERPTR: dev_dbg(q->dev, "%s memory method USERPTR\n", __func__); - /* The only USERPTR currently supported is the one needed for - read() method. - */ + /* handle pointer from user space */ if (vb->baddr) - return -EINVAL; + return videobuf_dma_contig_user_get(mem, vb); + /* allocate memory for the read() method */ mem->size = PAGE_ALIGN(vb->size); mem->vaddr = dma_alloc_coherent(q->dev, mem->size, &mem->dma_handle, GFP_KERNEL); @@ -400,7 +477,7 @@ void videobuf_dma_contig_free(struct videobuf_queue *q, So, it should free memory only if the memory were allocated for read() operation. 
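	   (USERPTR buffers stay owned by userspace: the code below only
	   resets the bookkeeping via videobuf_dma_contig_user_put() and
	   must never dma_free_coherent() memory it did not allocate)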
*/ - if ((buf->memory != V4L2_MEMORY_USERPTR) || buf->baddr) + if (buf->memory != V4L2_MEMORY_USERPTR) return; if (!mem) @@ -408,6 +485,13 @@ void videobuf_dma_contig_free(struct videobuf_queue *q, MAGIC_CHECK(mem->magic, MAGIC_DC_MEM); + /* handle user space pointer case */ + if (buf->baddr) { + videobuf_dma_contig_user_put(mem); + return; + } + + /* read() method */ dma_free_coherent(q->dev, mem->size, mem->vaddr, mem->dma_handle); mem->vaddr = NULL; } -- cgit v0.10.2 From dab48dab37d2770824420d1e01730a107fade1aa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 16 Jun 2009 15:32:37 -0700 Subject: page-allocator: warn if __GFP_NOFAIL is used for a large allocation __GFP_NOFAIL is a bad fiction. Allocations _can_ fail, and callers should detect and suitably handle this (and not by lamely moving the infinite loop up to the caller level either). Attempting to use __GFP_NOFAIL for a higher-order allocation is even worse, so add a once-off runtime check for this to slap people around for even thinking about trying it. Cc: David Rientjes Acked-by: Mel Gorman Acked-by: Peter Zijlstra Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 076463c..61290ea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1128,6 +1128,19 @@ again: list_del(&page->lru); pcp->count--; } else { + if (unlikely(gfp_flags & __GFP_NOFAIL)) { + /* + * __GFP_NOFAIL is not to be used in new code. + * + * All __GFP_NOFAIL callers should be fixed so that they + * properly detect and handle allocation failures. + * + * We most definitely don't want callers attempting to + * allocate greater than single-page units with + * __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 0); + } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); -- cgit v0.10.2 From 75927af8bcb940dad4fe281713d526cb520869ff Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:32:38 -0700 Subject: mm: madvise(): correct return code The posix_madvise() function succeeds (and does nothing) when called with parameters (NULL, 0, -1); according to LSB tests, it should fail with EINVAL because -1 is not a valid flag. When called with a valid address and size, it correctly fails. So perform an initial check for valid flags first. Reported-by: Jiri Dluhos Signed-off-by: Nick Piggin Reviewed-and-Tested-by: WANG Cong Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/madvise.c b/mm/madvise.c index e994dcb..76eb419 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -238,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, break; default: - error = -EINVAL; + BUG(); break; } return error; } +static int +madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 1; + + default: + return 0; + } +} /* * The madvise(2) system call. * @@ -289,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; + if (!madvise_behavior_valid(behavior)) + return error; + write = madvise_need_mmap_write(behavior); if (write) down_write(¤t->mm->mmap_sem); -- cgit v0.10.2 From 7f33d49a2ed546e01f7b1d0607661810f2421859 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 16 Jun 2009 15:32:41 -0700 Subject: mm, PM/Freezer: Disable OOM killer when tasks are frozen Currently, the following scenario appears to be possible in theory: * Tasks are frozen for hibernation or suspend. * Free pages are almost exhausted. * Certain piece of code in the suspend code path attempts to allocate some memory using GFP_KERNEL and allocation order less than or equal to PAGE_ALLOC_COSTLY_ORDER. * __alloc_pages_internal() cannot find a free page so it invokes the OOM killer. * The OOM killer attempts to kill a task, but the task is frozen, so it doesn't die immediately. * __alloc_pages_internal() jumps to 'restart', unsuccessfully tries to find a free page and invokes the OOM killer. * No progress can be made. Although it is now hard to trigger during hibernation due to the memory shrinking carried out by the hibernation code, it is theoretically possible to trigger during suspend after the memory shrinking has been removed from that code path. Moreover, since memory allocations are going to be used for the hibernation memory shrinking, it will be even more likely to happen during hibernation. To prevent it from happening, introduce the oom_killer_disabled switch that will cause __alloc_pages_internal() to fail in the situations in which the OOM killer would have been called and make the freezer set this switch after tasks have been successfully frozen. [akpm@linux-foundation.org: be nicer to the namespace] Signed-off-by: Rafael J. Wysocki Cc: Fengguang Wu Cc: David Rientjes Acked-by: Pavel Machek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4efa330..06b7e8c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -243,4 +243,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +extern bool oom_killer_disabled; + +static inline void oom_killer_disable(void) +{ + oom_killer_disabled = true; +} + +static inline void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + #endif /* __LINUX_GFP_H */ diff --git a/kernel/power/process.c b/kernel/power/process.c index ca63401..da2072d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -117,9 +117,12 @@ int freeze_processes(void) if (error) goto Exit; printk("done."); + + oom_killer_disable(); Exit: BUG_ON(in_atomic()); printk("\n"); + return error; } @@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only) void thaw_processes(void) { + oom_killer_enable(); + printk("Restarting tasks ... 
"); thaw_tasks(true); thaw_tasks(false); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61290ea..5b09488 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -178,6 +178,8 @@ static void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -1769,6 +1771,8 @@ rebalance: */ if (!did_some_progress) { if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, -- cgit v0.10.2 From 35282a2de4e5e4e173ab61aa9d7015886021a821 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 16 Jun 2009 15:32:43 -0700 Subject: migration: only migrate_prep() once per move_pages() migrate_prep() is fairly expensive (72us on 16-core barcelona 1.9GHz). Commit 3140a2273009c01c27d316f35ab76a37e105fdd8 improved move_pages() throughput by breaking it into chunks, but it also made migrate_prep() be called once per chunk (every 128pages or so) instead of once per move_pages(). This patch reverts to calling migrate_prep() only once per chunk as we did before 2.6.29. It is also a followup to commit 0aedadf91a70a11c4a3e7c7d99b21e5528af8d5d ("mm: move migrate_prep out from under mmap_sem"). This improves migration throughput on the above machine from 600MB/s to 750MB/s. Signed-off-by: Brice Goglin Acked-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Heiko Carstens Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Cc: Lee Schermerhorn Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 5a24923..939888f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct page_to_node *pp; LIST_HEAD(pagelist); - migrate_prep(); down_read(&mm->mmap_sem); /* @@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); if (!pm) goto out; + + migrate_prep(); + /* * Store a chunk of page_to_node array in a page, * but keep the last one as a marker -- cgit v0.10.2 From 69c854817566db82c362797b4a6521d0b00fe1d8 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 16 Jun 2009 15:32:44 -0700 Subject: vmscan: prevent shrinking of active anon lru list in case of no swap space V3 shrink_zone() can deactivate active anon pages even if we don't have a swap device. Many embedded products don't have a swap device. So the deactivation of anon pages is unnecessary. This patch prevents unnecessary deactivation of anon lru pages. But, it don't prevent aging of anon pages to swap out. Signed-off-by: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Johannes Weiner Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 7592d8e..879d034 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1570,7 +1570,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
*/ - if (inactive_anon_is_low(zone, sc)) + if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); -- cgit v0.10.2 From 31c911329e048b715a1dfeaaf617be9430fd7f4e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Jun 2009 15:32:45 -0700 Subject: mm: check the argument of kunmap on architectures without highmem If you're using a non-highmem architecture, passing an argument with the wrong type to kunmap() doesn't give you a warning because the ifdef doesn't check the type. Using a static inline function solves the problem nicely. Reported-by: David Woodhouse Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 1fcb712..211ff44 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -55,7 +55,9 @@ static inline void *kmap(struct page *page) return page_address(page); } -#define kunmap(page) do { (void) (page); } while (0) +static inline void kunmap(struct page *page) +{ +} static inline void *kmap_atomic(struct page *page, enum km_type idx) { -- cgit v0.10.2 From b70d94ee438b3fd9c15c7691d7a932a135c18101 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:46 -0700 Subject: page-allocator: use integer fields lookup for gfp_zone and check for errors in flags passed to the page allocator This simplifies the code in gfp_zone() and also keeps the ability of the compiler to use constant folding to get rid of gfp_zone processing. The lookup of the zone is done using a bitfield stored in an integer. So the code in gfp_zone is a simple extraction of bits from a constant bitfield. The compiler is generating a load of a constant into a register and then performs a shift and mask operation to get the zone from a gfp_t. No cachelines are touched and no branches have to be predicted by the compiler. We are doing some macro tricks here to convince the compiler to always do the constant folding if possible. 
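The trick is easy to reproduce outside the kernel. The sketch below uses simplified two-bit entries and made-up zone numbers, but shows the same single shift-and-mask lookup that the compiler can fold to a constant when the flags are known at compile time:

#include <stdio.h>

/* simplified stand-in for the kernel setup: 2-bit table entries,
 * indexed by the low GFP bits; the zone numbers are illustrative */
enum { Z_NORMAL = 0, Z_DMA = 1, Z_HIGHMEM = 2, Z_MOVABLE = 3 };
#define ZSHIFT 2

#define ZTABLE ( \
	((unsigned long)Z_NORMAL  << (0x0 * ZSHIFT)) | \
	((unsigned long)Z_DMA     << (0x1 * ZSHIFT)) | \
	((unsigned long)Z_HIGHMEM << (0x2 * ZSHIFT)) | \
	((unsigned long)Z_MOVABLE << (0xa * ZSHIFT)))

static inline int zone_of(unsigned flags)
{
	unsigned bit = flags & 0xf;

	/* one shift and one mask: no branches, no memory access */
	return (ZTABLE >> (bit * ZSHIFT)) & ((1 << ZSHIFT) - 1);
}

int main(void)
{
	printf("0x0 -> %d, 0x1 -> %d, 0x2 -> %d, 0xa -> %d\n",
	       zone_of(0x0), zone_of(0x1), zone_of(0x2), zone_of(0xa));
	return 0;
}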
Signed-off-by: Christoph Lameter Cc: KAMEZAWA Hiroyuki Reviewed-by: Mel Gorman Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 06b7e8c..412178a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -21,7 +21,8 @@ struct vm_area_struct; #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) #define __GFP_DMA32 ((__force gfp_t)0x04u) - +#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */ +#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /* * Action modifiers - doesn't change the zoning * @@ -51,7 +52,6 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ -#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -116,24 +116,105 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) ((gfp_flags & __GFP_RECLAIMABLE) != 0); } -static inline enum zone_type gfp_zone(gfp_t flags) -{ +#ifdef CONFIG_HIGHMEM +#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM +#else +#define OPT_ZONE_HIGHMEM ZONE_NORMAL +#endif + #ifdef CONFIG_ZONE_DMA - if (flags & __GFP_DMA) - return ZONE_DMA; +#define OPT_ZONE_DMA ZONE_DMA +#else +#define OPT_ZONE_DMA ZONE_NORMAL #endif + #ifdef CONFIG_ZONE_DMA32 - if (flags & __GFP_DMA32) - return ZONE_DMA32; +#define OPT_ZONE_DMA32 ZONE_DMA32 +#else +#define OPT_ZONE_DMA32 ZONE_NORMAL #endif - if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) == - (__GFP_HIGHMEM | __GFP_MOVABLE)) - return ZONE_MOVABLE; -#ifdef CONFIG_HIGHMEM - if (flags & __GFP_HIGHMEM) - return ZONE_HIGHMEM; + +/* + * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the + * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long + * and there are 16 of them to cover all possible combinations of + * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM + * + * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. + * But GFP_MOVABLE is not only a zone specifier but also an allocation + * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. + * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1". + * + * bit result + * ================= + * 0x0 => NORMAL + * 0x1 => DMA or NORMAL + * 0x2 => HIGHMEM or NORMAL + * 0x3 => BAD (DMA+HIGHMEM) + * 0x4 => DMA32 or DMA or NORMAL + * 0x5 => BAD (DMA+DMA32) + * 0x6 => BAD (HIGHMEM+DMA32) + * 0x7 => BAD (HIGHMEM+DMA32+DMA) + * 0x8 => NORMAL (MOVABLE+0) + * 0x9 => DMA or NORMAL (MOVABLE+DMA) + * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) + * 0xb => BAD (MOVABLE+HIGHMEM+DMA) + * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32) + * 0xd => BAD (MOVABLE+DMA32+DMA) + * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) + * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) + * + * ZONES_SHIFT must be <= 2 on 32 bit platforms. 
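+ *
+ * Worked example: flags = __GFP_HIGHMEM (table index 0x2). gfp_zone()
+ * below computes (GFP_ZONE_TABLE >> (0x2 * ZONES_SHIFT)) and masks it
+ * with ((1 << ZONES_SHIFT) - 1), which yields OPT_ZONE_HIGHMEM.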
+ */ + +#if 16 * ZONES_SHIFT > BITS_PER_LONG +#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer +#endif + +#define GFP_ZONE_TABLE ( \ + (ZONE_NORMAL << 0 * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ + | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \ + | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\ + | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\ +) + +/* + * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32 + * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per + * entry starting with bit 0. Bit is set if the combination is not + * allowed. + */ +#define GFP_ZONE_BAD ( \ + 1 << (__GFP_DMA | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32) \ + | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\ +) + +static inline enum zone_type gfp_zone(gfp_t flags) +{ + enum zone_type z; + int bit = flags & GFP_ZONEMASK; + + z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & + ((1 << ZONES_SHIFT) - 1); + + if (__builtin_constant_p(bit)) + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + else { +#ifdef CONFIG_DEBUG_VM + BUG_ON((GFP_ZONE_BAD >> bit) & 1); #endif - return ZONE_NORMAL; + } + return z; } /* -- cgit v0.10.2 From bc75d33f0fc1d56e734db1f56d3cfc8097b8e0cf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:48 -0700 Subject: page-allocator: clean up functions related to pages_min Change the names of two functions. It doesn't affect behavior. Presently, setup_per_zone_pages_min() changes low, high of zone as well as min. So a better name is setup_per_zone_wmarks(). That's because Mel changed zone->pages_[hig/low/min] to zone->watermark array in "page allocator: replace the watermark-related union in struct zone with a watermark[] array". * setup_per_zone_pages_min => setup_per_zone_wmarks Of course, we have to change init_per_zone_pages_min, too. There are not pages_min any more. 
* init_per_zone_pages_min => init_per_zone_wmark_min [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index c80dbd4..6e11cda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1052,7 +1052,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn); extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); -extern void setup_per_zone_pages_min(void); +extern void setup_per_zone_wmarks(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5..037291e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,7 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b09488..8629c84 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4396,12 +4396,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-added * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4519,7 +4520,7 @@ static void __init setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4530,12 +4531,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4547,7 +4548,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } -- cgit v0.10.2 From 96cb4df5ddf5e6d5785b5acd4003e3689b87f896 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:49 -0700 Subject: page-allocator: add inactive ratio calculation function of each zone Factor the per-zone arithemetic inside setup_per_zone_inactive_ratio()'s loop into a a separate function, calculate_zone_inactive_ratio(). 
This function will be used in a later patch [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e11cda..f0473a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1053,6 +1053,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_wmarks(void); +extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8629c84..303607f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4478,22 +4478,26 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void __init setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) + ratio = int_sqrt(10 * gb); + else + ratio = 1; - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; + zone->inactive_ratio = ratio; +} - zone->inactive_ratio = ratio; - } +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* -- cgit v0.10.2 From bce7394a3ef82b8477952fbab838e4a6e8cb47d2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:50 -0700 Subject: page-allocator: reset wmark_min and inactive ratio of zone when hotplug happens Solve two problems. Whenever memory hotplug successfully completes, zone->present_pages changes. 1) Memory hotplug currently calls setup_per_zone_wmark_min only when online_pages() is called, not offline_pages(), which leaves the watermarks unbalanced. 2) If zone->present_pages changes, we also have to recalculate zone->inactive_ratio, because inactive_ratio depends on zone->present_pages.
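For reference, the recalculated ratio follows the int_sqrt(10 * gb) curve shown in the page_alloc.c comment above; a small userspace check (with int_sqrt() reimplemented here by hand) reproduces that table:

#include <stdio.h>

/* userspace reimplementation of the kernel's integer square root */
static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;
	unsigned long bit = 1UL << (sizeof(x) * 8 - 2);

	while (bit > x)
		bit >>= 2;
	while (bit) {
		if (x >= r + bit) {
			x -= r + bit;
			r = (r >> 1) + bit;
		} else {
			r >>= 1;
		}
		bit >>= 2;
	}
	return r;
}

int main(void)
{
	static const unsigned long gbs[] = { 1, 10, 100, 1024, 10240 };
	unsigned int i;

	/* prints inactive_ratio 3, 10, 31, 101, 320 -- matching the
	 * 1GB / 10GB / 100GB / 1TB / 10TB rows of the kernel comment */
	for (i = 0; i < sizeof(gbs) / sizeof(gbs[0]); i++)
		printf("%5lu GB -> inactive_ratio %lu\n",
		       gbs[i], int_sqrt(10 * gbs[i]));
	return 0;
}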
Signed-off-by: Minchan Kim Acked-by: Yasunori Goto Cc: Rik van Riel Cc: KOSAKI Motohiro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 037291e..e4412a6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -423,6 +423,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->zone_pgdat->node_present_pages += onlined_pages; setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); @@ -832,6 +833,9 @@ repeat: totalram_pages -= offlined_pages; num_physpages -= offlined_pages; + setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 303607f..00e2937 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4397,7 +4397,7 @@ static void setup_per_zone_lowmem_reserve(void) /** * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-added + * or when memory is hot-{added|removed} * * Ensures that the watermark[min,low,high] values for each zone are set * correctly with respect to min_free_kbytes. -- cgit v0.10.2 From 6837765963f1723e80ca97b1fae660f3a60d77df Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:51 -0700 Subject: mm: remove CONFIG_UNEVICTABLE_LRU config option Currently, nobody wants to turn UNEVICTABLE_LRU off. Thus this configurability is unnecessary. Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Andi Kleen Acked-by: Minchan Kim Cc: David Woodhouse Cc: Matt Mackall Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/base/node.c b/drivers/base/node.c index 40b8097..91d4087 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -72,10 +72,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d Inactive(anon): %8lu kB\n" "Node %d Active(file): %8lu kB\n" "Node %d Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Node %d Unevictable: %8lu kB\n" "Node %d Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "Node %d HighTotal: %8lu kB\n" "Node %d HighFree: %8lu kB\n" @@ -105,10 +103,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_INACTIVE_ANON)), nid, K(node_page_state(nid, NR_ACTIVE_FILE)), nid, K(node_page_state(nid, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU nid, K(node_page_state(nid, NR_UNEVICTABLE)), nid, K(node_page_state(nid, NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c6b0302..d5c410d 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Inactive(anon): %8lu kB\n" "Active(file): %8lu kB\n" "Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Unevictable: %8lu kB\n" "Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" @@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(pages[LRU_INACTIVE_ANON]), K(pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_FILE]), -#ifdef CONFIG_UNEVICTABLE_LRU K(pages[LRU_UNEVICTABLE]), K(global_page_state(NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/page.c b/fs/proc/page.c index 9d926bd..2707c6c 100644 
--- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -172,10 +172,8 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); -#ifdef CONFIG_UNEVICTABLE_LRU u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); -#endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db976b9..8895985 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,13 +83,8 @@ enum zone_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ -#ifdef CONFIG_UNEVICTABLE_LRU NR_UNEVICTABLE, /* " " " " " */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ -#else - NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ - NR_MLOCK = NR_ACTIVE_FILE, -#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ @@ -132,11 +127,7 @@ enum lru_list { LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, -#ifdef CONFIG_UNEVICTABLE_LRU LRU_UNEVICTABLE, -#else - LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ -#endif NR_LRU_LISTS }; @@ -156,11 +147,7 @@ static inline int is_active_lru(enum lru_list l) static inline int is_unevictable_lru(enum lru_list l) { -#ifdef CONFIG_UNEVICTABLE_LRU return (l == LRU_UNEVICTABLE); -#else - return 0; -#endif } enum zone_watermarks { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62214c7..d6792f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -95,9 +95,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ -#ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif @@ -248,14 +246,8 @@ PAGEFLAG_FALSE(SwapCache) SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif -#ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) -#else -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 @@ -382,12 +374,6 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ -#ifdef CONFIG_UNEVICTABLE_LRU -#define __PG_UNEVICTABLE (1 << PG_unevictable) -#else -#define __PG_UNEVICTABLE 0 -#endif - #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define __PG_MLOCKED (1 << PG_mlocked) #else @@ -403,7 +389,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34da523..aec3252 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -22,9 +22,7 @@ enum mapping_flags { AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ -#ifdef CONFIG_UNEVICTABLE_LRU AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ -#endif }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -37,8 +35,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } } -#ifdef CONFIG_UNEVICTABLE_LRU - static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); @@ -55,14 +51,6 @@ static inline int mapping_unevictable(struct address_space *mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -#else -static inline void mapping_set_unevictable(struct address_space *mapping) { } -static inline void mapping_clear_unevictable(struct address_space *mapping) { } -static inline int mapping_unevictable(struct address_space *mapping) -{ - return 0; -} -#endif static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b35bc0e..619379a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,18 +105,11 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); -#ifdef CONFIG_UNEVICTABLE_LRU /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ int try_to_munlock(struct page *); -#else -static inline int try_to_munlock(struct page *page) -{ - return 0; /* a.k.a. 
SWAP_SUCCESS */ -} -#endif #else /* !CONFIG_MMU */ diff --git a/include/linux/swap.h b/include/linux/swap.h index d476aad..f30c069 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,7 +235,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); @@ -244,24 +243,6 @@ extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int page_evictable(struct page *page, - struct vm_area_struct *vma) -{ - return 1; -} - -static inline void scan_mapping_unevictable_pages(struct address_space *mapping) -{ -} - -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} - -static inline void scan_unevictable_unregister_node(struct node *node) { } -#endif extern int kswapd_run(int nid); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 524cd1b..ff4696c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -41,7 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif -#ifdef CONFIG_UNEVICTABLE_LRU UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ @@ -50,7 +49,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ UNEVICTABLE_MLOCKFREED, -#endif NR_VM_EVENT_ITEMS }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0e51a35..2ccee08 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1325,7 +1325,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif -#ifdef CONFIG_UNEVICTABLE_LRU { .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", @@ -1334,7 +1333,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, -#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/mm/Kconfig b/mm/Kconfig index 71830ba..97d2c88 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -203,25 +203,13 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. 
- config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index b4ac332..f02c750 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -73,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -85,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* diff --git a/mm/mlock.c b/mm/mlock.c index ac13043..45eb650 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00e2937..c95a77c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2077,19 +2077,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -2113,9 +2108,7 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2129,9 +2122,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ?
"yes" : "no") diff --git a/mm/rmap.c b/mm/rmap.c index 23122af..316c9d6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1202,7 +1202,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1226,4 +1225,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c index 879d034..2c4b945 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -568,20 +567,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -2470,7 +2455,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2717,4 +2701,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e151cf..1e3aa81 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -687,7 +685,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -697,7 +694,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, -- cgit v0.10.2 From cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:52 -0700 Subject: mm: add swap cache interface for swap reference In a following patch, the usage of swap cache is recorded into swap_map. This patch is for necessary interface changes to do that. 2 interfaces: - swapcache_prepare() - swapcache_free() are added for allocating/freeing refcnt from swap-cache to existing swap entries. But implementation itself is not changed under this patch. At adding swapcache_free(), memcg's hook code is moved under swapcache_free(). This is better than using scattered hooks. 
Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index f30c069..259e96c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,8 +282,10 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); +extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); +extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); @@ -352,11 +354,16 @@ static inline void show_swap_cache_info(void) #define free_swap_and_cache(swp) is_migration_entry(swp) #define swap_duplicate(swp) is_migration_entry(swp) +#define swapcache_prepare(swp) is_migration_entry(swp) static inline void swap_free(swp_entry_t swp) { } +static inline void swapcache_free(swp_entry_t swp, struct page *page) +{ +} + static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/shmem.c b/mm/shmem.c index 0132fbd..47ab191 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); - swap_free(swap); + swapcache_free(swap, NULL); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e..19bdf30 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -162,11 +162,11 @@ int add_to_swap(struct page *page) return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); + swapcache_free(entry, NULL); continue; default: /* -ENOMEM radix-tree allocation failure */ - swap_free(entry); + swapcache_free(entry, NULL); return 0; } } @@ -188,8 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); - mem_cgroup_uncharge_swapcache(page, entry); - swap_free(entry); + swapcache_free(entry, page); page_cache_release(page); } @@ -293,7 +292,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swap_duplicate(entry)) + if (!swapcache_prepare(entry)) break; /* @@ -317,7 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); - swap_free(entry); + swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe..3187079 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -510,6 +510,16 @@ void swap_free(swp_entry_t entry) } /* + * Called after dropping swapcache to decrease refcnt to swap entries. + */ +void swapcache_free(swp_entry_t entry, struct page *page) +{ + if (page) + mem_cgroup_uncharge_swapcache(page, entry); + return swap_free(entry); +} + +/* * How many references to page are currently swapped out? 
*/ static inline int page_swapcount(struct page *page) @@ -1979,6 +1989,15 @@ bad_file: goto out; } +/* + * Called when allocating swap cache for exising swap entry, + */ +int swapcache_prepare(swp_entry_t entry) +{ + return swap_duplicate(entry); +} + + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c4b945..52339dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_swapcache(page, swap); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); -- cgit v0.10.2 From 355cfa73ddff2fb8fa14e93bd94a057cc022512e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:53 -0700 Subject: mm: modify swap_map and add SWAP_HAS_CACHE flag This is a part of the patches for fixing memcg's swap accounting leak. But, IMHO, it is not a bad patch even without memcg. There are two kinds of references to swap: - reference from a swap entry - reference from the swap cache Then, - if there is a swap cache && swap's refcnt is 1, there is only the swap cache. (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL This counting logic has worked well for a long time. But considering that we cannot tell from swap_map[] whether there is a _real_ reference or not, the current usage of the counter is not very good. This patch adds a flag SWAP_HAS_CACHE and records whether a swap entry has a swap cache or not. This removes the -1 magic used in swapfile.c and helps avoid unnecessary find_get_page(). Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Daisuke Nishimura Cc: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index 259e96c..fed5e8e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -129,9 +129,10 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7fff -#define SWAP_MAP_BAD 0x8000 - +#define SWAP_MAP_MAX 0x7ffe +#define SWAP_MAP_BAD 0x7fff +#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */ +#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE) /* * The in-memory structure used to track swap areas. */ @@ -281,7 +282,7 @@ extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); -extern int swap_duplicate(swp_entry_t); +extern void swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); @@ -353,9 +354,12 @@ static inline void show_swap_cache_info(void) } #define free_swap_and_cache(swp) is_migration_entry(swp) -#define swap_duplicate(swp) is_migration_entry(swp) #define swapcache_prepare(swp) is_migration_entry(swp) +static inline void swap_duplicate(swp_entry_t swp) +{ +} + static inline void swap_free(swp_entry_t swp) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 19bdf30..b9ca029 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it.
*/ - if (!swapcache_prepare(entry)) + err = swapcache_prepare(entry); + if (err == -EEXIST) /* seems racy */ + continue; + if (err) /* swp entry is obsolete ? */ break; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 3187079..0d72969 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,33 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +/* For reference count accounting in swap_map */ +/* enum for swap_map[] handling. internal use only */ +enum { + SWAP_MAP = 0, /* ops for reference from swap users */ + SWAP_CACHE, /* ops for reference from swap cache */ +}; + +static inline int swap_count(unsigned short ent) +{ + return ent & SWAP_COUNT_MASK; +} + +static inline bool swap_has_cache(unsigned short ent) +{ + return !!(ent & SWAP_HAS_CACHE); +} + +static inline unsigned short encode_swapmap(int count, bool has_cache) +{ + unsigned short ret = count; + + if (has_cache) + return SWAP_HAS_CACHE | ret; + return ret; +} + + /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. And swap_lock @@ -167,7 +194,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si, + int cache) { unsigned long offset; unsigned long scan_base; @@ -285,7 +313,10 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - si->swap_map[offset] = 1; + if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ + si->swap_map[offset] = encode_swapmap(0, true); + else /* at suspend */ + si->swap_map[offset] = encode_swapmap(1, false); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -401,7 +432,8 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - offset = scan_swap_map(si); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -415,6 +447,7 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -424,7 +457,8 @@ swp_entry_t get_swap_page_of_type(int type) si = swap_info + type; if (si->flags & SWP_WRITEOK) { nr_swap_pages--; - offset = scan_swap_map(si); + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, SWAP_MAP); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -471,25 +505,38 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) +static int swap_entry_free(struct swap_info_struct *p, + swp_entry_t ent, int cache) { unsigned long offset = swp_offset(ent); - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; - nr_swap_pages++; - p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); + int count = swap_count(p->swap_map[offset]); + bool has_cache; + + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_MAP) { /* dropping usage count of swap */ + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = encode_swapmap(count, has_cache); } + } else { /* dropping 
swap cache flag */ + VM_BUG_ON(!has_cache); + p->swap_map[offset] = encode_swapmap(count, false); + + } + /* return code. */ + count = p->swap_map[offset]; + /* free if no reference */ + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; + nr_swap_pages++; + p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } return count; } @@ -504,7 +551,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry); + swap_entry_free(p, entry, SWAP_MAP); spin_unlock(&swap_lock); } } @@ -514,9 +561,16 @@ void swap_free(swp_entry_t entry) */ void swapcache_free(swp_entry_t entry, struct page *page) { + struct swap_info_struct *p; + if (page) mem_cgroup_uncharge_swapcache(page, entry); - return swap_free(entry); + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, entry, SWAP_CACHE); + spin_unlock(&swap_lock); + } + return; } /* @@ -531,8 +585,7 @@ static inline int page_swapcount(struct page *page) entry.val = page_private(page); p = swap_info_get(entry); if (p) { - /* Subtract the 1 for the swap cache itself */ - count = p->swap_map[swp_offset(entry)] - 1; + count = swap_count(p->swap_map[swp_offset(entry)]); spin_unlock(&swap_lock); } return count; @@ -594,7 +647,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry) == 1) { + if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -901,7 +954,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, i = 1; } count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) + if (count && swap_count(count) != SWAP_MAP_BAD) break; } return i; @@ -1005,13 +1058,13 @@ static int try_to_unuse(unsigned int type) */ shmem = 0; swcount = *swap_map; - if (swcount > 1) { + if (swap_count(swcount)) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else retval = unuse_mm(start_mm, entry, page); } - if (*swap_map > 1) { + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; struct mm_struct *new_start_mm = start_mm; @@ -1021,7 +1074,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && !shmem && + while (swap_count(*swap_map) && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1033,14 +1086,16 @@ static int try_to_unuse(unsigned int type) cond_resched(); swcount = *swap_map; - if (swcount <= 1) + if (!swap_count(swcount)) /* any usage ? */ ; else if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else retval = unuse_mm(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { + + if (set_start_mm && + swap_count(*swap_map) < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; @@ -1067,21 +1122,25 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 
- * + * How could swap count reach 0x7ffe ? + * There's no way to repeat a swap page within an mm + * (except in shmem, where it's the shared object which takes + * the reference count)? + * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned + * short is too small....) * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ - if (*swap_map == SWAP_MAP_MAX) { + /* We might release the lock_page() in unuse_mm(). */ + if (!PageSwapCache(page) || page_private(page) != entry.val) + goto retry; + + if (swap_count(*swap_map) == SWAP_MAP_MAX) { spin_lock(&swap_lock); - *swap_map = 1; + *swap_map = encode_swapmap(0, true); spin_unlock(&swap_lock); reset_overflow = 1; } @@ -1099,7 +1158,8 @@ static int try_to_unuse(unsigned int type) * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + if (swap_count(*swap_map) && + PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -1126,6 +1186,7 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); +retry: unlock_page(page); page_cache_release(page); @@ -1952,15 +2013,23 @@ void si_swapinfo(struct sysinfo *val) * * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff. + * Returns error code in following case. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. 
-> ENOENT */ -int swap_duplicate(swp_entry_t entry) +static int __swap_duplicate(swp_entry_t entry, bool cache) { struct swap_info_struct * p; unsigned long offset, type; - int result = 0; + int result = -EINVAL; + int count; + bool has_cache; if (is_migration_entry(entry)) - return 1; + return -EINVAL; type = swp_type(entry); if (type >= nr_swapfiles) @@ -1969,17 +2038,40 @@ offset = swp_offset(entry); spin_lock(&swap_lock); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + + if (unlikely(offset >= p->max)) + goto unlock_out; + + count = swap_count(p->swap_map[offset]); + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + + /* set SWAP_HAS_CACHE if there is no cache and entry is used */ + if (!has_cache && count) { + p->swap_map[offset] = encode_swapmap(count, true); + result = 0; + } else if (has_cache) /* someone added cache */ + result = -EEXIST; + else if (!count) /* no users */ + result = -ENOENT; + + } else if (count || has_cache) { + if (count < SWAP_MAP_MAX - 1) { + p->swap_map[offset] = encode_swapmap(count + 1, + has_cache); + result = 0; + } else if (count <= SWAP_MAP_MAX) { if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; + printk(KERN_WARNING + "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, + has_cache); + result = 0; } - } + } else + result = -ENOENT; /* unused swap entry */ +unlock_out: spin_unlock(&swap_lock); out: return result; @@ -1988,13 +2080,25 @@ bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } +/* + * increase reference count of swap entry by 1. + */ +void swap_duplicate(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP); +} /* + * @entry: swap entry for which we allocate swap cache. + * * Called when allocating swap cache for exising swap entry, + * This can return error codes. Returns 0 at success. + * -EBUSY means there is a swap cache. + * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) { - return swap_duplicate(entry); + return __swap_duplicate(entry, SWAP_CACHE); } @@ -2035,7 +2139,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } /* Count contiguous allocated slots below our target */ @@ -2043,7 +2147,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } spin_unlock(&swap_lock); -- cgit v0.10.2 From c9e444103b5e7a5a3519f9913f59767f92e33baf Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:54 -0700 Subject: mm: reuse unused swap entry if necessary Presently we can tell via swap_map that a swap entry is used only as SwapCache, without looking up the swap cache. This gives us a chance to reuse swap-cache-only swap entries in get_swap_pages(). This patch tries to free swap-cache-only swap entries when swap space is not enough. Note: we hit the following path when the swap_cluster code cannot find a free cluster.
vm_swap_full() is then not the only condition that allows the kernel to reclaim unused swap. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Tested-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swapfile.c b/mm/swapfile.c index 0d72969..28faa01 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -79,6 +79,32 @@ static inline unsigned short encode_swapmap(int count, bool has_cache) return ret; } +/* returnes 1 if swap entry is freed */ +static int +__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +{ + int type = si - swap_info; + swp_entry_t entry = swp_entry(type, offset); + struct page *page; + int ret = 0; + + page = find_get_page(&swapper_space, entry.val); + if (!page) + return 0; + /* + * This function is called from scan_swap_map() and it's called + * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. + * We have to use trylock for avoiding deadlock. This is a special + * case and you should use try_to_free_swap() with explicit lock_page() + * in usual operations. + */ + if (trylock_page(page)) { + ret = try_to_free_swap(page); + unlock_page(page); + } + page_cache_release(page); + return ret; +} /* * We need this because the bdev->unplug_fn can sleep and we cannot @@ -301,6 +327,19 @@ checks: goto no_page; if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&swap_lock); + swap_was_freed = __try_to_reclaim_swap(si, offset); + spin_lock(&swap_lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed) + goto checks; + goto scan; /* check next one */ + } + if (si->swap_map[offset]) goto scan; @@ -382,6 +421,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -393,6 +436,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; -- cgit v0.10.2 From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:56 -0700 Subject: oom: move oom_adj value from task_struct to mm_struct The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE.
This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. The task_lock() taken in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series, where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cd8717a..ebff3c1 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes -should be killed in an out-of-memory situation. Giving it a high score will -increase the likelihood of this process being killed by the oom-killer. Valid -values are in the range -16 to +15, plus the special value -17, which disables -oom-killing altogether for this process. +This file can be used to adjust the score used to select which processes should +be killed in an out-of-memory situation. The oom_adj value is a characteristic +of the task's mm, so all threads that share an mm with pid will have the same +oom_adj value. A high value will increase the likelihood of this process being +killed by the oom-killer. Valid values are in the range -16 to +15 as +explained below and a special value of -17, which disables oom-killing +altogether for threads sharing pid's mm. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. +/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from +oom-killing already. + /proc/<pid>/oom_score shows process' current badness score.
The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 1539e63..3ce5ae9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + task_lock(task); + if (task->mm) + oom_adjust = task->mm->oom_adj; + else + oom_adjust = OOM_DISABLE; + task_unlock(task); put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + task_lock(task); + if (!task->mm) { + task_unlock(task); + put_task_struct(task); + return -EINVAL; + } + if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { + task_unlock(task); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + task->mm->oom_adj = oom_adjust; + task_unlock(task); put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26..f440810 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,6 +232,8 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + s8 oom_adj; /* OOM kill score adjustment (bit shift) */ + cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1048bf5..1bc6fae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1178,7 +1178,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460..b609135 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj; task_lock(p); mm = p->mm; @@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } + oom_adj = mm->oom_adj; /* * The memory size of the process is the basis for the badness. @@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. 
*/ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + task_lock(p); + if (p->mm && p->mm->oom_adj == OOM_DISABLE) { + task_unlock(p); continue; + } + task_unlock(p); points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { @@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, - p->comm); + get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p) * Don't kill the process if any threads are set to OOM_DISABLE */ do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + task_lock(q); + if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { + task_unlock(q); return 1; + } + task_unlock(q); } while_each_thread(g, q); __oom_kill_task(p, 1); @@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); task_lock(current); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->mm ? current->mm->oom_adj : OOM_DISABLE); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); -- cgit v0.10.2 From 4d8b9135c30ccbe46e621fefd862969819003fd6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:57 -0700 Subject: oom: avoid unnecessary mm locking and scanning for OOM_DISABLE This moves the check for OOM_DISABLE to the badness heuristic so it is only necessary to hold task_lock() once. If the mm is OOM_DISABLE, the score is 0, which is also correctly exported via /proc/pid/oom_score. This requires that tasks with badness scores of 0 are prohibited from being oom killed, which makes sense since they would not allow for future memory freeing anyway. Since the oom_adj value is a characteristic of an mm and not a task, it is no longer necessary to check the oom_adj value for threads sharing the same memory (except when simply issuing SIGKILLs for threads in other thread groups). Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b609135..cdcf89c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -67,6 +67,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) return 0; } oom_adj = mm->oom_adj; + if (oom_adj == OOM_DISABLE) { + task_unlock(p); + return 0; + } /* * The memory size of the process is the basis for the badness. 
@@ -253,15 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - task_lock(p); - if (p->mm && p->mm->oom_adj == OOM_DISABLE) { - task_unlock(p); - continue; - } - task_unlock(p); - points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + if (points > *ppoints) { chosen = p; *ppoints = points; } @@ -354,32 +351,13 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; + task_lock(p); mm = p->mm; - - /* WARNING: mm may not be dereferenced since we did not obtain its - * value from get_task_mm(p). This is OK since all we need to do is - * compare mm to q->mm below. - * - * Furthermore, even if mm contains a non-NULL value, p->mm may - * change to NULL at any time since we do not hold task_lock(p). - * However, this is of no concern to us. - */ - - if (mm == NULL) + if (!mm || mm->oom_adj == OOM_DISABLE) { + task_unlock(p); return 1; - - /* - * Don't kill the process if any threads are set to OOM_DISABLE - */ - do_each_thread(g, q) { - task_lock(q); - if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { - task_unlock(q); - return 1; - } - task_unlock(q); - } while_each_thread(g, q); - + } + task_unlock(p); __oom_kill_task(p, 1); /* -- cgit v0.10.2 From 82553a937f12352c26fe457510ebab3f512cd3fa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:58 -0700 Subject: oom: invoke oom killer for __GFP_NOFAIL The oom killer must be invoked regardless of the order if the allocation is __GFP_NOFAIL, otherwise it will loop forever when reclaim fails to free some memory. Cc: Nick Piggin Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Dave Hansen Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c95a77c..c5fb017 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1561,7 +1561,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) goto out; /* Exhausted what can be done so it's blamo time */ @@ -1781,11 +1781,13 @@ rebalance: goto got_pg; /* - * The OOM killer does not trigger for high-order allocations - * but if no progress is being made, there are no other - * options and retrying is unlikely to help + * The OOM killer does not trigger for high-order + * ~__GFP_NOFAIL allocations so if no progress is being + * made, there are no other options and retrying is + * unlikely to help. */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && + !(gfp_mask & __GFP_NOFAIL)) goto nopage; goto restart; -- cgit v0.10.2 From 286973552f051404abdb58dd9b2f8f7558efe4e5 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Tue, 16 Jun 2009 15:32:59 -0700 Subject: mm: remove __invalidate_mapping_pages variant Remove __invalidate_mapping_pages atomic variant now that its sole caller can sleep (fixed in eccb95cee4f0d56faa46ef22fb94dd4a3578d3eb ("vfs: fix lock inversion in drop_pagecache_sb()")). This fixes softlockups that can occur while in the drop_caches path. 
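The softlockup fix amounts to letting the scan yield between pagevec batches unconditionally, now that no caller needs an atomic variant. A toy user-space analogue of that pattern, with sched_yield() standing in for cond_resched() and all names illustrative:

#include <sched.h>
#include <stdio.h>

#define PAGEVEC_SIZE 14	/* batch size used by pagevecs at the time */

static unsigned long invalidate_pages(unsigned long nr_pages)
{
	unsigned long done = 0;

	while (nr_pages) {
		unsigned long batch = nr_pages < PAGEVEC_SIZE ?
				      nr_pages : PAGEVEC_SIZE;

		done += batch;		/* "invalidate" one batch of pages */
		nr_pages -= batch;
		sched_yield();		/* unconditional resched point */
	}
	return done;
}

int main(void)
{
	printf("invalidated %lu pages\n", invalidate_pages(100000));
	return 0;
}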
Signed-off-by: Mike Waychison Cc: Jan Kara Cc: Wu Fengguang Cc: Dave Chinner Cc: Nick Piggin Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/drop_caches.c b/fs/drop_caches.c index b6a719a..a2edb79 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb) continue; __iget(inode); spin_unlock(&inode_lock); - __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; spin_lock(&inode_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8146e02..f5ae9f1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2036,9 +2036,6 @@ extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif extern int invalidate_inodes(struct super_block *); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, - bool be_atomic); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579..ccc3ecf 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, bool be_atomic) +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; @@ -309,30 +322,10 @@ unlock: break; } pagevec_release(&pvec); - if (likely(!be_atomic)) - cond_resched(); + cond_resched(); } return ret; } - -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, false); -} EXPORT_SYMBOL(invalidate_mapping_pages); /* -- cgit v0.10.2 From 73d60b7f747176dbdff826c4127d22e1fd3f9f74 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Jun 2009 15:33:00 -0700 Subject: page-allocator: clear N_HIGH_MEMORY map before we set it again SRAT tables may contain nodes of very small size. The arch code may decide not to activate such a node.
However, currently the early boot code sets N_HIGH_MEMORY for such nodes. These nodes therefore seem to be active although they have no present pages. On 64-bit, N_HIGH_MEMORY == N_NORMAL_MEMORY, so this works for 64-bit too. Signed-off-by: Yinghai Lu Tested-by: Jack Steiner Acked-by: Christoph Lameter Cc: Mel Gorman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5fb017..6407cbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4204,6 +4204,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); + /* + * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init + * that node_mask, clear it at first + */ + nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); -- cgit v0.10.2 From 8192da6a8811ab6c3d29dc590a5f94a377c43739 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:01 -0700 Subject: mm: remove annotation of gfp_mask in add_to_swap Hugh removed add_to_swap's gfp_mask argument (mm: remove gfp_mask from add_to_swap), so we have to remove the kernel-doc annotation of gfp_mask from the function as well. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swap_state.c b/mm/swap_state.c index b9ca029..b62e7f5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page) /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap - * @gfp_mask: memory allocation flags * * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. -- cgit v0.10.2 From aca8bf323edd31ad462dc98c107c23a5c6022ca2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm: remove file argument from swap_readpage() The file argument is a leftover from address_space's readpage from a long time ago. We don't use it any more, so let's remove the unnecessary argument. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index fed5e8e..0cedf31 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -256,7 +256,7 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ -extern int swap_readpage(struct file *, struct page *); +extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern void end_swap_bio_read(struct bio *bio, int err); diff --git a/mm/page_io.c b/mm/page_io.c index 3023c47..c6f3e50 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -120,7 +120,7 @@ out: return ret; } -int swap_readpage(struct file *file, struct page *page) +int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index b62e7f5..42cd38e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -313,7 +313,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return.
*/ lru_cache_add_anon(new_page); - swap_readpage(NULL, new_page); + swap_readpage(new_page); return new_page; } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); -- cgit v0.10.2 From 168f5ac668f63dfb64439766e3ef9e866b83719d Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm cleanup: shmem_file_setup: 'char *' -> 'const char *' for name argument Since shmem_file_setup() does not modify, allocate, free, or pass on the given filename, mark it as const. Signed-off-by: Sergei Trofimovich Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index f0473a8..d88d6fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,7 +724,7 @@ static inline int shmem_lock(struct file *file, int lock, return 0; } #endif -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); diff --git a/mm/shmem.c b/mm/shmem.c index 47ab191..e89d7ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; -- cgit v0.10.2 From 608e8e66a154cbc3d591a59dcebfd9cbc9e3431a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:04 -0700 Subject: mm: add a gfp-translate script to help understand page allocation failure reports The page allocation failure messages include a line that looks like page allocation failure. order:1, mode:0x4020 The mode is easy to translate but irritating for the lazy and a bit error prone. This patch adds a very simple helper script gfp-translate for the mode: portion of the page allocation failure messages. An example usage looks like mel@machina:~/linux-2.6 $ scripts/gfp-translate 0x4020 Source: /home/mel/linux-2.6 Parsing: 0x4020 #define __GFP_HIGH (0x20) /* Should access emergency pools? */ #define __GFP_COMP (0x4000) /* Add compound page metadata */ The script is not a work of art but it has come in handy for me a few times so I thought I would share. [akpm@linux-foundation.org: clarify an error message] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Pekka Enberg Cc: Christoph Hellwig Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/gfp-translate b/scripts/gfp-translate new file mode 100644 index 0000000..073cb6d --- /dev/null +++ b/scripts/gfp-translate @@ -0,0 +1,81 @@ +#!/bin/bash +# Translate the bits making up a GFP mask +# (c) 2009, Mel Gorman +# Licensed under the terms of the GNU GPL License version 2 +SOURCE= +GFPMASK=none + +# Helper function to report failures and exit +die() { + echo ERROR: $@ + if [ "$TMPFILE" != "" ]; then + rm -f $TMPFILE + fi + exit -1 +} + +usage() { + echo "usage: gfp-translate [-h] [ --source DIRECTORY ] gfpmask" + exit 0 +} + +# Parse command-line arguements +while [ $# -gt 0 ]; do + case $1 in + --source) + SOURCE=$2 + shift 2 + ;; + -h) + usage + ;; + --help) + usage + ;; + *) + GFPMASK=$1 + shift + ;; + esac +done + +# Guess the kernel source directory if it's not set.
Preference is in order of +# o current directory +# o /usr/src/linux +if [ "$SOURCE" = "" ]; then + if [ -r "/usr/src/linux/Makefile" ]; then + SOURCE=/usr/src/linux + fi + if [ -r "`pwd`/Makefile" ]; then + SOURCE=`pwd` + fi +fi + +# Confirm that a source directory exists +if [ ! -r "$SOURCE/Makefile" ]; then + die "Could not locate kernel source directory or it is invalid" +fi + +# Confirm that a GFP mask has been specified +if [ "$GFPMASK" = "none" ]; then + usage +fi + +# Extract GFP flags from the kernel source +TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1 +grep "^#define __GFP" $SOURCE/include/linux/gfp.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE + +# Parse the flags +IFS=" +" +echo Source: $SOURCE +echo Parsing: $GFPMASK +for LINE in `cat $TMPFILE`; do + MASK=`echo $LINE | awk '{print $3}'` + if [ $(($GFPMASK&$MASK)) -ne 0 ]; then + echo $LINE + fi +done + +rm -f $TMPFILE +exit 0 -- cgit v0.10.2 From 6fe6b7e35785e3232ffe7f81d3893f1316710a02 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:05 -0700 Subject: vmscan: report vm_flags in page_referenced() Collect vma->vm_flags of the VMAs that actually referenced the page. This is preparing for more informed reclaim heuristics, eg. to protect executable file pages more aggressively. For now only the VM_EXEC bit will be used by the caller. Thanks to Johannes, Peter and Minchan for all the good tips. Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 619379a..216d024 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,7 +83,8 @@ static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, /* * Called from mm/vmscan.c to handle paging out */ -int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt); +int page_referenced(struct page *, int is_locked, + struct mem_cgroup *cnt, unsigned long *vm_flags); int try_to_unmap(struct page *, int ignore_refs); /* @@ -117,7 +118,7 @@ int try_to_munlock(struct page *); #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page,l,cnt) TestClearPageReferenced(page) +#define page_referenced(page, locked, cnt, flags) TestClearPageReferenced(page) #define try_to_unmap(page, refs) SWAP_FAIL static inline int page_mkclean(struct page *page) diff --git a/mm/rmap.c b/mm/rmap.c index 316c9d6..c9ccc1a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * repeatedly from either page_referenced_anon or page_referenced_file. 
*/ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) + struct vm_area_struct *vma, + unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -381,11 +383,14 @@ out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: + if (referenced) + *vm_flags |= vma->vm_flags; return referenced; } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, * @page: the page to test * @is_locked: caller holds lock on the page * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
*/ -int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont) +int page_referenced(struct page *page, + int is_locked, + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { int referenced = 0; if (TestClearPageReferenced(page)) referenced++; + *vm_flags = 0; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont); + referenced += page_referenced_anon(page, mem_cont, + vm_flags); else if (is_locked) - referenced += page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, mem_cont, + vm_flags); else if (!trylock_page(page)) referenced++; else { if (page->mapping) - referenced += - page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, + mem_cont, vm_flags); unlock_page(page); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 52339dd..6be2068 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -577,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -627,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); /* In active use or really unfreeable? Activate it. */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced && page_mapping_inuse(page)) @@ -1208,6 +1210,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, { unsigned long pgmoved; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); struct page *page; @@ -1248,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) pgmoved++; list_add(&page->lru, &l_inactive); -- cgit v0.10.2 From 8cab4754d24a0f2e05920170c845bd84472814c6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:12 -0700 Subject: vmscan: make mapped executable pages the first class citizen Protect referenced PROT_EXEC mapped pages from being deactivated. PROT_EXEC (or its internal representation, VM_EXEC) pages normally belong to currently running executables and their linked libraries; they really should be cached aggressively to provide a good user experience. Thanks to Johannes Weiner for the advice to reuse the VMA walk in page_referenced() to get the PROT_EXEC bit. [more details] ( The consequences of this patch will have to be discussed together with Rik van Riel's recent patch "vmscan: evict use-once pages first". ) ( Some of the good points and insights are taken into this changelog. Thanks to all the involved people for the great LKML discussions. ) the problem =========== For a typical desktop, the most precious working set is composed of *actively accessed* (1) memory mapped executables (2) and their anonymous pages (3) and other files (4) and the dcache/icache/.. slabs while the least important data are (5) infrequently used or use-once files For a typical desktop, one major problem is bursty, large amounts of (5) use-once files flushing out the working set.
Inside the working set, (4) dcache/icache have already been too sticky ;-) So we only have to care about (2) anonymous pages and (1)(3) file pages. anonymous pages =============== Anonymous pages are effectively immune to the streaming IO attack, because we now have separate file/anon LRU lists. When the use-once files crowd into the file LRU, the list's "quality" is significantly lowered. Therefore the scan balance policy in get_scan_ratio() will choose to scan the (low quality) file LRU much more frequently than the anon LRU. file pages ========== Rik proposed to *not* scan the active file LRU when the inactive list grows larger than the active list. This guarantees that when there is use-once streaming IO and the working set is not too large (so that active_size < inactive_size), the active file LRU will *not* be scanned at all. So the not-too-large working set can be well protected. But there are also situations where the file working set is a bit large so that (active_size >= inactive_size), or the streaming IOs are not purely use-once. In these cases the active list will be scanned slowly, because the current shrink_active_list() policy is to deactivate active pages regardless of their referenced bits. The deactivated pages become susceptible to the streaming IO attack: the inactive list could be scanned fast (500MB / 50MBps = 10s) so that the deactivated pages don't have enough time to get re-referenced, because a user tends to switch between windows at intervals ranging from seconds to minutes. This patch holds mapped executable pages in the active list as long as they are referenced during each full scan of the active list. Because the active list is normally scanned much more slowly, they get a longer grace time (e.g. 100s) for further references, which better matches the pace of user operations. Therefore this patch greatly prolongs the in-cache time of executable code, under moderate memory pressure. before patch: guaranteed to be cached if reference intervals < I after patch: guaranteed to be cached if reference intervals < I+A (except when randomly reclaimed by the lumpy reclaim) where A = time to fully scan the active file LRU I = time to fully scan the inactive file LRU Note that normally A >> I. side effects ============ This patch is safe in general; it restores the pre-2.6.28 mmap() behavior, but in a much smaller and better-targeted scope. One may worry about someone abusing the PROT_EXEC heuristic. But as Andrew Morton stated, there are other tricks for getting that sort of boost. Another concern is the PROT_EXEC mapped pages growing large in rare cases, and therefore hurting reclaim efficiency. But a sane application targeted at a large audience will never use PROT_EXEC for data mappings. If some home-made application tries to abuse that bit, it shall be aware of the consequences. If it is abused to the scale of 2/3 of total memory, it gains nothing but overhead. benchmarks ========== 1) memory tight desktop 1.1) brief summary - clock time and major faults are reduced by 50%; - pswpin numbers are reduced to ~1/3. That means X desktop responsiveness is doubled under high memory/swap pressure. 1.2) test scenario - nfsroot gnome desktop with 512M physical memory - run some programs, and switch between the existing windows after starting each new program.
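As an aside, the file-backed executable mappings this heuristic protects are easy to see from userspace: a PROT_EXEC file mapping shows up with "r-xp" permissions in /proc/PID/maps. A minimal sketch, for illustration only and not part of the patch:

/* exec_maps.c - list a process's file-backed executable mappings,
 * i.e. the mappings whose pages the VM_EXEC heuristic above would
 * keep on the active list. Userspace illustration, not kernel code. */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char path[64], line[512];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/maps",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strstr(line, " r-xp ") && strchr(line, '/'))
			fputs(line, stdout);	/* file-backed, executable */
	fclose(f);
	return 0;
}
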
1.3) progress timing (seconds) before after programs 0.02 0.02 N xeyes 0.75 0.76 N firefox 2.02 1.88 N nautilus 3.36 3.17 N nautilus --browser 5.26 4.89 N gthumb 7.12 6.47 N gedit 9.22 8.16 N xpdf /usr/share/doc/shared-mime-info/shared-mime-info-spec.pdf 13.58 12.55 N xterm 15.87 14.57 N mlterm 18.63 17.06 N gnome-terminal 21.16 18.90 N urxvt 26.24 23.48 N gnome-system-monitor 28.72 26.52 N gnome-help 32.15 29.65 N gnome-dictionary 39.66 36.12 N /usr/games/sol 43.16 39.27 N /usr/games/gnometris 48.65 42.56 N /usr/games/gnect 53.31 47.03 N /usr/games/gtali 58.60 52.05 N /usr/games/iagno 65.77 55.42 N /usr/games/gnotravex 70.76 61.47 N /usr/games/mahjongg 76.15 67.11 N /usr/games/gnome-sudoku 86.32 75.15 N /usr/games/glines 92.21 79.70 N /usr/games/glchess 103.79 88.48 N /usr/games/gnomine 113.84 96.51 N /usr/games/gnotski 124.40 102.19 N /usr/games/gnibbles 137.41 114.93 N /usr/games/gnobots2 155.53 125.02 N /usr/games/blackjack 179.85 135.11 N /usr/games/same-gnome 224.49 154.50 N /usr/bin/gnome-window-properties 248.44 162.09 N /usr/bin/gnome-default-applications-properties 282.62 173.29 N /usr/bin/gnome-at-properties 323.72 188.21 N /usr/bin/gnome-typing-monitor 363.99 199.93 N /usr/bin/gnome-at-visual 394.21 206.95 N /usr/bin/gnome-sound-properties 435.14 224.49 N /usr/bin/gnome-at-mobility 463.05 234.11 N /usr/bin/gnome-keybinding-properties 503.75 248.59 N /usr/bin/gnome-about-me 554.00 276.27 N /usr/bin/gnome-display-properties 615.48 304.39 N /usr/bin/gnome-network-preferences 693.03 342.01 N /usr/bin/gnome-mouse-properties 759.90 388.58 N /usr/bin/gnome-appearance-properties 937.90 508.47 N /usr/bin/gnome-control-center 1109.75 587.57 N /usr/bin/gnome-keyboard-properties 1399.05 758.16 N : oocalc 1524.64 830.03 N : oodraw 1684.31 900.03 N : ooimpress 1874.04 993.91 N : oomath 2115.12 1081.89 N : ooweb 2369.02 1161.99 N : oowriter Note that the last ": oo*" commands are actually commented out. 
1.4) vmstat numbers (some relevant ones are marked with *) before after nr_free_pages 1293 3898 nr_inactive_anon 59956 53460 nr_active_anon 26815 30026 nr_inactive_file 2657 3218 nr_active_file 2019 2806 nr_unevictable 4 4 nr_mlock 4 4 nr_anon_pages 26706 27859 *nr_mapped 3542 4469 nr_file_pages 72232 67681 nr_dirty 1 0 nr_writeback 123 19 nr_slab_reclaimable 3375 3534 nr_slab_unreclaimable 11405 10665 nr_page_table_pages 8106 7864 nr_unstable 0 0 nr_bounce 0 0 *nr_vmscan_write 394776 230839 nr_writeback_temp 0 0 numa_hit 6843353 3318676 numa_miss 0 0 numa_foreign 0 0 numa_interleave 1719 1719 numa_local 6843353 3318676 numa_other 0 0 *pgpgin 5954683 2057175 *pgpgout 1578276 922744 *pswpin 1486615 512238 *pswpout 394568 230685 pgalloc_dma 277432 56602 pgalloc_dma32 6769477 3310348 pgalloc_normal 0 0 pgalloc_movable 0 0 pgfree 7048396 3371118 pgactivate 2036343 1471492 pgdeactivate 2189691 1612829 pgfault 3702176 3100702 *pgmajfault 452116 201343 pgrefill_dma 12185 7127 pgrefill_dma32 334384 653703 pgrefill_normal 0 0 pgrefill_movable 0 0 pgsteal_dma 74214 22179 pgsteal_dma32 3334164 1638029 pgsteal_normal 0 0 pgsteal_movable 0 0 pgscan_kswapd_dma 1081421 1216199 pgscan_kswapd_dma32 58979118 46002810 pgscan_kswapd_normal 0 0 pgscan_kswapd_movable 0 0 pgscan_direct_dma 2015438 1086109 pgscan_direct_dma32 55787823 36101597 pgscan_direct_normal 0 0 pgscan_direct_movable 0 0 pginodesteal 3461 7281 slabs_scanned 564864 527616 kswapd_steal 2889797 1448082 kswapd_inodesteal 14827 14835 pageoutrun 43459 21562 allocstall 9653 4032 pgrotated 384216 228631 1.5) free numbers at the end of the tests before patch: total used free shared buffers cached Mem: 474 467 7 0 0 236 -/+ buffers/cache: 230 243 Swap: 1023 418 605 after patch: total used free shared buffers cached Mem: 474 457 16 0 0 236 -/+ buffers/cache: 221 253 Swap: 1023 404 619 2) memory flushing in a file server 2.1) brief summary The number of major faults from 50 to 3 during 10% cache hot reads. That means this patch successfully stops major faults when the active file list is slowly scanned when there are partially cache hot streaming IO. 2.2) test scenario Do 100000 pread(size=110 pages, offset=(i*100) pages), where 10% of the pages will be activated: for i in `seq 0 100 10000000`; do echo $i 110; done > pattern-hot-10 iotrace.rb --load pattern-hot-10 --play /b/sparse vmmon nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree and monitor /proc/vmstat during the time. The test box has 2G memory. I carried out tests on fresh booted console as well as X desktop, and fetched the vmstat numbers on (1) begin: shortly after the big read IO starts; (2) end: just before the big read IO stops; (3) restore: the big read IO stops and the zsh working set restored (4) restore X: after IO, switch back and forth between the urxvt and firefox windows to restore their working set. 
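For those without iotrace.rb (the author's tool), a rough C sketch of the same access pattern can be used instead; it assumes 4K pages and the /b/sparse test file named above:

/* replay.c - approximate C version of the pread pattern above: 100000
 * reads of 110 pages at offsets of i*100 pages, so each read overlaps
 * the previous one by 10 pages (the ~10% cache-hot part). This only
 * mimics the offset/size pattern, not iotrace.rb's instrumentation. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define PAGE 4096L

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/b/sparse";
	char *buf = malloc(110 * PAGE);
	int fd = open(path, O_RDONLY);
	long long i;

	if (fd < 0 || !buf) {
		perror("setup");
		return 1;
	}
	for (i = 0; i <= 10000000; i += 100)	/* seq 0 100 10000000 */
		if (pread(fd, buf, 110 * PAGE, (off_t)(i * PAGE)) < 0) {
			perror("pread");
			break;
		}
	free(buf);
	close(fd);
	return 0;
}
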
2.3) console mode results nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree 2.6.29 VM_EXEC protection ON: begin: 2481 2237 8694 630 0 574299 end: 275 231976 233914 633 776271 20933042 restore: 370 232154 234524 691 777183 20958453 2.6.29 VM_EXEC protection ON (second run): begin: 2434 2237 8493 629 0 574195 end: 284 231970 233536 632 771918 20896129 restore: 399 232218 234789 690 774526 20957909 2.6.30-rc4-mm VM_EXEC protection OFF: begin: 2479 2344 9659 210 0 579643 end: 284 232010 234142 260 772776 20917184 restore: 379 232159 234371 301 774888 20967849 The above console numbers show that - The startup pgmajfault of 2.6.30-rc4-mm is merely 1/3 that of 2.6.29. I'd attribute that improvement to the mmap readahead improvements :-) - The pgmajfault increment during the file copy is 633-630=3 vs 260-210=50. That's a huge improvement - which means with the VM_EXEC protection logic, active mmap pages is pretty safe even under partially cache hot streaming IO. - when active:inactive file lru size reaches 1:1, their scan rates is 1:20.8 under 10% cache hot IO. (computed with formula Dpgdeactivate:Dpgfree) That roughly means the active mmap pages get 20.8 more chances to get re-referenced to stay in memory. - The absolute nr_mapped drops considerably to 1/9 during the big IO, and the dropped pages are mostly inactive ones. The patch has almost no impact in this aspect, that means it won't unnecessarily increase memory pressure. (In contrast, your 20% mmap protection ratio will keep them all, and therefore eliminate the extra 41 major faults to restore working set of zsh etc.) The iotrace.rb read throughput is 151.194384MB/s 284.198252s 100001x 450560b --load pattern-hot-10 --play /b/sparse which means the inactive list is rotated at the speed of 250MB/s, so a full scan of which takes about 3.5 seconds, while a full scan of active file list takes about 77 seconds. 2.4) X mode results We can reach roughly the same conclusions for X desktop: nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree 2.6.30-rc4-mm VM_EXEC protection ON: begin: 9740 8920 64075 561 0 678360 end: 768 218254 220029 565 798953 21057006 restore: 857 218543 220987 606 799462 21075710 restore X: 2414 218560 225344 797 799462 21080795 2.6.30-rc4-mm VM_EXEC protection OFF: begin: 9368 5035 26389 554 0 633391 end: 770 218449 221230 661 646472 17832500 restore: 1113 218466 220978 710 649881 17905235 restore X: 2687 218650 225484 947 802700 21083584 - the absolute nr_mapped drops considerably (to 1/13 of the original size) during the streaming IO. - the delta of pgmajfault is 3 vs 107 during IO, or 236 vs 393 during the whole process. 
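Tying these measurements back to the I/I+A guarantee in the description: with the section 2.3 numbers (I about 3.5 seconds, A about 77 seconds), the grace time for executable pages grows by more than an order of magnitude. A trivial calculator, illustrative arithmetic only:

/* grace.c - plug the measured scan times above into the "I vs I+A"
 * guarantee. The constants are the ones measured in test 2.3, not
 * universal values. */
#include <stdio.h>

int main(void)
{
	double I = 3.5;		/* full scan of the inactive file LRU, s */
	double A = 77.0;	/* full scan of the active file LRU, s */

	printf("before patch: cached if re-referenced within %.1fs\n", I);
	printf("after  patch: cached if re-referenced within %.1fs\n", I + A);
	return 0;
}
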
Cc: Elladan Cc: Nick Piggin Cc: Andi Kleen Cc: Christoph Lameter Acked-by: Rik van Riel Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Reviewed-by: Johannes Weiner Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 6be2068..1024979 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1212,6 +1212,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, unsigned long pgscanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; struct pagevec pvec; @@ -1251,28 +1252,42 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { pgmoved++; + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + list_add(&page->lru, &l_active); + continue; + } + } list_add(&page->lru, &l_inactive); } /* - * Move the pages to the [file or anon] inactive list. + * Move pages back to the lru list. */ pagevec_init(&pvec, 1); - lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); /* - * Count referenced pages from currently used mappings as - * rotated, even though they are moved to the inactive list. - * This helps balance scan pressure between file and anonymous - * pages in get_scan_ratio. + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. 
*/ reclaim_stat->recent_rotated[!!file] += pgmoved; pgmoved = 0; /* count pages moved to inactive list */ + lru = LRU_BASE + file * LRU_FILE; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1295,6 +1310,29 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgmoved); + + pgmoved = 0; /* count pages moved back to active list */ + lru = LRU_ACTIVE + file * LRU_FILE; + while (!list_empty(&l_active)) { + page = lru_to_page(&l_active); + prefetchw_prev_lru_page(page, &l_active, flags); + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + VM_BUG_ON(!PageActive(page)); + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); -- cgit v0.10.2 From 3eb4140f0389bdada022d5e8efd88504ad30df14 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:13 -0700 Subject: vmscan: merge duplicate code in shrink_active_list() The "move pages to active list" and "move pages to inactive list" code blocks are mostly identical and can be served by a function. Thanks to Andrew Morton for pointing this out. Note that buffer_heads_over_limit check will also be carried out for re-activated pages, which is slightly different from pre-2.6.28 kernels. Also, Rik's "vmscan: evict use-once pages first" patch could totally stop scans of active file list when memory pressure is low. So the net effect could be, the number of buffer heads is now more likely to grow large. However that's fine according to Johannes' comments: I don't think that this could be harmful. We just preserve the buffer mappings of what we consider the working set and with low memory pressure, as you say, this set is not big. As to stripping of reactivated pages: the only pages we re-activate for now are those VM_EXEC mapped ones. Since we don't expect IO from or to these pages, removing the buffer mappings in case they grow too large should be okay, I guess. Cc: Pekka Enberg Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 1024979..f3a55d1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1204,6 +1204,43 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) * But we had to alter page->flags anyway. 
*/ +static void move_active_pages_to_lru(struct zone *zone, + struct list_head *list, + enum lru_list lru) +{ + unsigned long pgmoved = 0; + struct pagevec pvec; + struct page *page; + + pagevec_init(&pvec, 1); + + while (!list_empty(list)) { + page = lru_to_page(list); + prefetchw_prev_lru_page(page, list, flags); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + + VM_BUG_ON(!PageActive(page)); + if (!is_active_lru(lru)) + ClearPageActive(page); /* we are de-activating */ + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + + if (!pagevec_add(&pvec, page) || list_empty(list)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) @@ -1215,8 +1252,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - enum lru_list lru; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); lru_add_drain(); @@ -1233,6 +1268,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[!!file] += pgmoved; + __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); else @@ -1275,8 +1311,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* * Move pages back to the lru list. */ - pagevec_init(&pvec, 1); - spin_lock_irq(&zone->lru_lock); /* * Count referenced pages from currently used mappings as rotated, @@ -1286,57 +1320,12 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; - pgmoved = 0; /* count pages moved to inactive list */ - lru = LRU_BASE + file * LRU_FILE; - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgmoved); - - pgmoved = 0; /* count pages moved back to active list */ - lru = LRU_ACTIVE + file * LRU_FILE; - while (!list_empty(&l_active)) { - page = lru_to_page(&l_active); - prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + move_active_pages_to_lru(zone, &l_active, + LRU_ACTIVE + file * LRU_FILE); + 
move_active_pages_to_lru(zone, &l_inactive, + LRU_BASE + file * LRU_FILE); spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - pagevec_release(&pvec); } static int inactive_anon_is_low_global(struct zone *zone) -- cgit v0.10.2 From 9198e96c06744517e3b18fce8be6db61e96a3227 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 16 Jun 2009 15:33:15 -0700 Subject: vmscan: handle may_swap more strictly Commit 2e2e425989080cc534fc0fca154cae515f971cf5 ("vmscan,memcg: reintroduce sc->may_swap") added the may_swap flag and handled it in get_scan_ratio(). But the result of get_scan_ratio() is ignored when priority == 0, so the anon LRU is scanned even if may_swap == 0 or nr_swap_pages == 0. IMHO, this is not the expected behavior. For memcg especially, because of this behavior many, many pages are swapped out in vain when oom is invoked by the mem+swap limit. This patch handles the may_swap flag more strictly. Signed-off-by: Daisuke Nishimura Reviewed-by: KOSAKI Motohiro Cc: Minchan Kim Cc: Johannes Weiner Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index f3a55d1..057e44b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1430,13 +1430,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - percent[0] = 0; - percent[1] = 100; - return; - } - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + @@ -1534,15 +1527,22 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + int noswap = 0; - get_scan_ratio(zone, sc, percent); + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + percent[0] = 0; + percent[1] = 100; + } else + get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { int file = is_file_lru(l); unsigned long scan; scan = zone_nr_pages(zone, sc, l); - if (priority) { + if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } -- cgit v0.10.2 From 81236810226f71bd9ff77321c8e8276dae7efc61 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:33:16 -0700 Subject: oom: only oom kill exiting tasks with attached memory When a task is chosen for oom kill and is found to be PF_EXITING, __oom_kill_task() is called to elevate the task's timeslice and give it access to memory reserves so that it may quickly exit. This privilege is unnecessary, however, if the task has already detached its mm. Although it's possible for the mm to become detached later since task_lock() is not held, __oom_kill_task() will simply be a no-op in such circumstances. Consequently, it is no longer necessary to warn about killing mm-less tasks since it is a no-op.
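A minimal userspace sketch of the check ordering described above, for illustration only; the task struct, the PF_EXITING value and the mm pointer are stand-ins, not the kernel's definitions:

/* oom_model.c - toy model of the p->mm && PF_EXITING test */
#include <stdio.h>

#define PF_EXITING 0x4	/* illustrative value */

struct task {
	unsigned long flags;
	void *mm;	/* NULL once the task has detached its mm */
};

/* fast path of oom_kill_process() after the patch */
static int fast_exit_path(const struct task *p)
{
	/* grant reserves only while there is still an mm to tear down */
	return p->mm && (p->flags & PF_EXITING);
}

int main(void)
{
	struct task with_mm = { PF_EXITING, (void *)1 };
	struct task no_mm = { PF_EXITING, NULL };

	printf("exiting, mm attached: %s\n", fast_exit_path(&with_mm) ?
	       "boost and let it die" : "full oom kill path");
	printf("exiting, mm detached: %s\n", fast_exit_path(&no_mm) ?
	       "boost and let it die" : "full oom kill path");
	return 0;
}
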
Signed-off-by: David Rientjes Acked-by: Rik van Riel Cc: Balbir Singh Cc: Minchan Kim Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index cdcf89c..175a67a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -325,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill an mm-less task!\n"); + if (!p->mm) return; - } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -397,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly + * if its mm is still attached. */ - if (p->flags & PF_EXITING) { + if (p->mm && (p->flags & PF_EXITING)) { __oom_kill_task(p, 0); return 0; } -- cgit v0.10.2 From 84a892456046921a40646114deed65e2df93a1bc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:17 -0700 Subject: writeback: skip new or to-be-freed inodes 1) I_FREEING tests should be coupled with I_CLEAR The two I_FREEING tests are racy because clear_inode() can set i_state to I_CLEAR between the clear of I_SYNC and the test of I_FREEING. 2) skip I_WILL_FREE inodes in generic_sync_sb_inodes() to avoid possible races with generic_forget_inode() generic_forget_inode() sets I_WILL_FREE and calls writeback on its own, so generic_sync_sb_inodes() must not try to step in and create possible races: generic_forget_inode inode->i_state |= I_WILL_FREE; spin_unlock(&inode_lock); generic_sync_sb_inodes() spin_lock(&inode_lock); __iget(inode); __writeback_single_inode // see non zero i_count may WARN here ==> WARN_ON(inode->i_state & I_WILL_FREE); spin_unlock(&inode_lock); may call generic_forget_inode again ==> iput(inode); The above race and warning didn't turn up because writeback_inodes() holds the s_umount lock, so generic_forget_inode() finds MS_ACTIVE and returns early. But we are not sure the UBIFS calls and future callers will guarantee that. So skip I_WILL_FREE inodes for the sake of safety.
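A userspace sketch of point 1), with illustrative flag values (the kernel's i_state bits differ): once clear_inode() replaces I_FREEING with I_CLEAR in the race window, only the combined mask still catches the inode.

/* istate_model.c - why the test needs (I_FREEING | I_CLEAR) */
#include <stdio.h>

#define I_FREEING (1 << 0)	/* stand-in values, not the kernel's */
#define I_CLEAR   (1 << 1)

static const char *old_test(unsigned s)
{
	return (s & I_FREEING) ? "skip" : "write back";
}

static const char *new_test(unsigned s)
{
	return (s & (I_FREEING | I_CLEAR)) ? "skip" : "write back";
}

int main(void)
{
	unsigned i_state = I_FREEING;

	printf("I_FREEING: old=%s new=%s\n",
	       old_test(i_state), new_test(i_state));
	i_state = I_CLEAR;	/* clear_inode() ran in the window */
	printf("I_CLEAR  : old=%s new=%s\n",
	       old_test(i_state), new_test(i_state));
	return 0;
}
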
Cc: Eric Sandeen Acked-by: Jeff Layton Cc: Masayoshi MIZUMA Signed-off-by: Wu Fengguang Cc: Artem Bityutskiy Cc: Christoph Hellwig Acked-by: Jan Kara Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 40308e9..caf0491 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -321,7 +321,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; - if (!(inode->i_state & I_FREEING)) { + if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if (!(inode->i_state & I_DIRTY) && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* @@ -492,7 +492,7 @@ void generic_sync_sb_inodes(struct super_block *sb, break; } - if (inode->i_state & I_NEW) { + if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; } @@ -523,7 +523,7 @@ void generic_sync_sb_inodes(struct super_block *sb, if (current_is_pdflush() && !writeback_acquire(bdi)) break; - BUG_ON(inode->i_state & I_FREEING); + BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); -- cgit v0.10.2 From 90afa5de6f3fa89a733861e843377302479fcf7e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:20 -0700 Subject: vmscan: properly account for the number of page cache pages zone_reclaim() can reclaim A bug was brought to my attention against a distro kernel but it affects mainline and I believe problems like this have been reported in various guises on the mailing lists although I don't have specific examples at the moment. The reported problem was that malloc() stalled for a long time (minutes in some cases) if a large tmpfs mount was occupying a large percentage of memory overall. The pages did not get cleaned or reclaimed by zone_reclaim() because the zone_reclaim_mode was unsuitable, but the lists are uselessly scanned frequently, making the CPU spin at near 100%. This patchset intends to address that bug and bring the behaviour of zone_reclaim() more in line with expectations noted during the investigation. It is based on top of mmotm and takes advantage of Kosaki's work with respect to zone_reclaim(). Patch 1 fixes the heuristics that zone_reclaim() uses to determine if the scan should go ahead. The broken heuristic is what was causing the malloc() stall as it uselessly scanned the LRU constantly. Currently, zone_reclaim is assuming zone_reclaim_mode is 1 and historically it could not deal with tmpfs pages at all. This fixes up the heuristic so that an unnecessary scan is more likely to be correctly avoided. Patch 2 notes that zone_reclaim() returning a failure automatically means the zone is marked full. This is not always true. It could have failed because the GFP mask or zone_reclaim_mode were unsuitable. Patch 3 introduces a counter, zone_reclaim_failed, that will increment each time the zone_reclaim scan-avoidance heuristics fail. If that counter is rapidly increasing, then zone_reclaim_mode should be set to 0 as a temporary resolution and a bug reported, because the scan-avoidance heuristic is still broken. This patch: On NUMA machines, the administrator can configure zone_reclaim_mode, which is a more targeted form of direct reclaim. On machines with large NUMA distances for example, zone_reclaim_mode defaults to 1, meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met.
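For reference, the mode is a bitmask; a quick userspace decoder, with the bit meanings as documented in Documentation/sysctl/vm.txt (1 = zone reclaim on, 2 = may write out dirty pages, 4 = may swap/unmap pages):

/* reclaim_mode.c - decode /proc/sys/vm/zone_reclaim_mode */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/zone_reclaim_mode", "r");
	int mode = 0;

	if (!f || fscanf(f, "%d", &mode) != 1) {
		perror("zone_reclaim_mode");
		return 1;
	}
	fclose(f);
	printf("zone_reclaim_mode = %d\n", mode);
	printf("  1 zone reclaim on          : %s\n", (mode & 1) ? "yes" : "no");
	printf("  2 may write dirty pages    : %s\n", (mode & 2) ? "yes" : "no");
	printf("  4 may swap (unmap) pages   : %s\n", (mode & 4) ? "yes" : "no");
	return 0;
}
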
There is a heuristic that determines if the scan is worthwhile but the problem is that the heuristic is not being properly applied and is basically assuming zone_reclaim_mode is 1 if it is enabled. The lack of proper detection can manifest as high CPU usage as the LRU list is scanned uselessly. Historically, once enabled it depended on NR_FILE_PAGES, which may include swapcache pages that the reclaim_mode cannot deal with. Patch vmscan-change-the-number-of-the-unmapped-files-in-zone-reclaim.patch by Kosaki Motohiro noted that zone_page_state(zone, NR_FILE_PAGES) included pages that were not file-backed such as swapcache and made a calculation based on the inactive, active and mapped files. This is far superior when zone_reclaim==1 but if RECLAIM_SWAP is set, then NR_FILE_PAGES is a reasonable starting figure. This patch alters how zone_reclaim() works out how many pages it might be able to reclaim given the current reclaim_mode. If RECLAIM_SWAP is set in the reclaim_mode, it will consider NR_FILE_PAGES as potential candidates; otherwise it uses NR_INACTIVE_FILE+NR_ACTIVE_FILE-NR_FILE_MAPPED (see zone_unmapped_file_pages() below) to discount swapcache and other non-file-backed pages. If RECLAIM_WRITE is not set, then the NR_FILE_DIRTY pages are not candidates. If RECLAIM_SWAP is not set, then the NR_FILE_MAPPED pages are not. [kosaki.motohiro@jp.fujitsu.com: Estimate unmapped pages minus tmpfs pages] [fengguang.wu@intel.com: Fix underflow problem in Kosaki's estimate] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 0ea5adb..c4de635 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -315,10 +315,14 @@ min_unmapped_ratio: This is available only on NUMA kernels. -A percentage of the total pages in each zone. Zone reclaim will only -occur if more than this percentage of pages are file backed and unmapped. -This is to insure that a minimal amount of local pages is still available for -file I/O even if the node is overallocated. +This is a percentage of the total pages in each zone. Zone reclaim will +only occur if more than this percentage of pages are in a state that +zone_reclaim_mode allows to be reclaimed. + +If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared +against all file-backed unmapped pages including swapcache pages and tmpfs +files. Otherwise, only unmapped pages backed by normal files but not tmpfs +files and similar are considered. The default is 1 percent. diff --git a/mm/vmscan.c b/mm/vmscan.c index 057e44b..79a98d9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2356,6 +2356,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ?
(file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. */ @@ -2390,9 +2432,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. @@ -2450,10 +2490,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return 0; if (zone_is_all_unreclaimable(zone)) -- cgit v0.10.2 From fa5e084e43eb14c14942027e1e2e894aeed96097 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:22 -0700 Subject: vmscan: do not unconditionally treat zones that fail zone_reclaim() as full On NUMA machines, the administrator can configure zone_reclaim_mode, which is a more targeted form of direct reclaim. On machines with large NUMA distances for example, zone_reclaim_mode defaults to 1, meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. The problem is that zone_reclaim() failing at all means the zone gets marked full. This can cause situations where a zone is usable, but is being skipped because it has been considered full. Take a situation where a large tmpfs mount is occupying a large percentage of memory overall. The pages do not get cleaned or reclaimed by zone_reclaim(), but the zone gets marked full and the zonelist cache considers them not worth trying in the future. This patch makes zone_reclaim() return more fine-grained information about what occurred when zone_reclaim() failed. The zone only gets marked full if it really is unreclaimable. If the scan did not occur, or if not enough pages were reclaimed with the limited reclaim_mode, then the zone is simply skipped. There is a side-effect to this patch. Currently, if zone_reclaim() successfully reclaimed SWAP_CLUSTER_MAX, an allocation attempt would go ahead.
With this patch applied, zone watermarks are rechecked after zone_reclaim() does some work. This bug was introduced by commit 9276b1bc96a132f4068fdee00983c532f43d3a26 ("memory page_alloc zonelist caching speedup") way back in 2.6.19 when the zonelist_cache was introduced. It was not intended that zone_reclaim() aggressively consider the zone to be full when it failed as full direct reclaim can still be an option. Due to the age of the bug, it should be considered a -stable candidate. Signed-off-by: Mel Gorman Reviewed-by: Wu Fengguang Reviewed-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index f02c750..f290c4d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -259,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas); +#define ZONE_RECLAIM_NOSCAN -2 +#define ZONE_RECLAIM_FULL -1 +#define ZONE_RECLAIM_SOME 0 +#define ZONE_RECLAIM_SUCCESS 1 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6407cbf..2f457a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1462,15 +1462,33 @@ zonelist_scan: BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; + int ret; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { - if (!zone_reclaim_mode || - !zone_reclaim(zone, gfp_mask, order)) + if (zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) + goto try_this_zone; + + if (zone_reclaim_mode == 0) + goto this_zone_full; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + goto try_next_zone; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + goto this_zone_full; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) goto this_zone_full; } } +try_this_zone: page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype); if (page) diff --git a/mm/vmscan.c b/mm/vmscan.c index 79a98d9..16c82a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2492,16 +2492,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return 0; + return ZONE_RECLAIM_FULL; if (zone_is_all_unreclaimable(zone)) - return 0; + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. 
*/ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2511,10 +2511,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); -- cgit v0.10.2 From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:23 -0700 Subject: vmscan: count the number of times zone_reclaim() scans and fails On NUMA machines, the administrator can configure zone_reclaim_mode, which is a more targeted form of direct reclaim. On machines with large NUMA distances for example, zone_reclaim_mode defaults to 1, meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile but it is possible that the heuristic will fail and the CPU gets tied up scanning uselessly. Detecting the situation requires some guesswork and experimentation so this patch adds a counter "zone_reclaim_failed" to /proc/vmstat. If during high CPU utilisation this counter is increasing rapidly, then the resolution to the problem may be to set /proc/sys/vm/zone_reclaim_mode to 0. [akpm@linux-foundation.org: name things consistently] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ff4696c..81a97cf 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGSTEAL), FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), +#ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_FAILED, +#endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/vmscan.c b/mm/vmscan.c index 16c82a8..3018ad7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2519,6 +2519,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e3aa81..138bed5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -673,6 +673,9 @@ static const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif "pginodesteal", "slabs_scanned", "kswapd_steal", -- cgit v0.10.2 From ee993b135ec75a93bd5c45e636bb210d2975159b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:33:24 -0700 Subject: mm: fix lumpy reclaim lru handling at isolate_lru_pages In lumpy reclaim, a page that __isolate_lru_page() fails to take can be pushed back to the "src" list by list_move(). But the page may not be from the "src" list at all, so this pushes the page back to the wrong LRU. The list_move() itself is also unnecessary, because the page is not at the top of the LRU. So leave the page as it is if __isolate_lru_page() fails.
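A toy userspace model of the before/after behaviour, for illustration only; there is no kernel list_head code here, and list membership is reduced to a label:

/* lumpy_model.c - a busy cursor page must stay where it is */
#include <stdio.h>

enum list_id { LRU_SRC, LRU_OTHER, DST };
static const char *name[] = { "src", "other lru", "dst" };

struct page { enum list_id on; };

/* stand-in for __isolate_lru_page(): 0 = isolated, -1 = busy */
static int isolate(struct page *p, int busy) { return busy ? -1 : 0; }

static void old_loop(struct page *p, int busy)
{
	if (isolate(p, busy) == 0)
		p->on = DST;
	else
		p->on = LRU_SRC;	/* bug: unconditional "back to src" */
}

static void new_loop(struct page *p, int busy)
{
	if (isolate(p, busy) == 0)
		p->on = DST;
	/* else: leave the page exactly where it was */
}

int main(void)
{
	struct page p = { LRU_OTHER };	/* cursor page from another list */

	old_loop(&p, 1);
	printf("old: busy page ends up on %s\n", name[p.on]);
	p.on = LRU_OTHER;
	new_loop(&p, 1);
	printf("new: busy page stays on %s\n", name[p.on]);
	return 0;
}
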
Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 3018ad7..4139aa5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -929,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode, file)) { - case 0: + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); nr_taken++; scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; /* ! on LRU or wrong list */ } } } -- cgit v0.10.2 From 44377f622ee4f23ea0afc9b83dba5d3ec2d560cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jun 2009 15:33:25 -0700 Subject: alpha: remove obsolete hw_interrupt_type The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) have been kept around for migration reasons. After more than two years it's time to remove them finally. This patch cleans up one of the remaining users. When all such patches hit mainline we can remove the defines and typedefs finally. Impact: cleanup Convert the last remaining users to struct irq_chip and remove the define. Signed-off-by: Thomas Gleixner Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c index 67c19f8..38c805d 100644 --- a/arch/alpha/kernel/irq_alpha.c +++ b/arch/alpha/kernel/irq_alpha.c @@ -227,7 +227,7 @@ struct irqaction timer_irqaction = { .name = "timer", }; -static struct hw_interrupt_type rtc_irq_type = { +static struct irq_chip rtc_irq_type = { .typename = "RTC", .startup = rtc_startup, .shutdown = rtc_enable_disable, diff --git a/arch/alpha/kernel/irq_i8259.c b/arch/alpha/kernel/irq_i8259.c index 9405bee..50bfec9 100644 --- a/arch/alpha/kernel/irq_i8259.c +++ b/arch/alpha/kernel/irq_i8259.c @@ -83,7 +83,7 @@ i8259a_end_irq(unsigned int irq) i8259a_enable_irq(irq); } -struct hw_interrupt_type i8259a_irq_type = { +struct irq_chip i8259a_irq_type = { .typename = "XT-PIC", .startup = i8259a_startup_irq, .shutdown = i8259a_disable_irq, diff --git a/arch/alpha/kernel/irq_impl.h b/arch/alpha/kernel/irq_impl.h index cc9a8a7..b63ccd7 100644 --- a/arch/alpha/kernel/irq_impl.h +++ b/arch/alpha/kernel/irq_impl.h @@ -36,7 +36,7 @@ extern void i8259a_disable_irq(unsigned int); extern void i8259a_mask_and_ack_irq(unsigned int); extern unsigned int i8259a_startup_irq(unsigned int); extern void i8259a_end_irq(unsigned int); -extern struct hw_interrupt_type i8259a_irq_type; +extern struct irq_chip i8259a_irq_type; extern void init_i8259a_irqs(void); extern void handle_irq(int irq); diff --git a/arch/alpha/kernel/irq_pyxis.c b/arch/alpha/kernel/irq_pyxis.c index d53edbc..69199a7 100644 --- a/arch/alpha/kernel/irq_pyxis.c +++ b/arch/alpha/kernel/irq_pyxis.c @@ -70,7 +70,7 @@ pyxis_mask_and_ack_irq(unsigned int irq) *(vulp)PYXIS_INT_MASK; } -static struct hw_interrupt_type pyxis_irq_type = { +static struct irq_chip pyxis_irq_type = { .typename = "PYXIS", .startup = pyxis_startup_irq, .shutdown = pyxis_disable_irq, diff --git a/arch/alpha/kernel/irq_srm.c b/arch/alpha/kernel/irq_srm.c index a03fbca..8522936 100644 --- a/arch/alpha/kernel/irq_srm.c +++ b/arch/alpha/kernel/irq_srm.c @@ -48,7 +48,7 @@ srm_end_irq(unsigned 
int irq) } /* Handle interrupts from the SRM, assuming no additional weirdness. */ -static struct hw_interrupt_type srm_irq_type = { +static struct irq_chip srm_irq_type = { .typename = "SRM", .startup = srm_startup_irq, .shutdown = srm_disable_irq, diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c index e53a1e1..382035e 100644 --- a/arch/alpha/kernel/sys_alcor.c +++ b/arch/alpha/kernel/sys_alcor.c @@ -89,7 +89,7 @@ alcor_end_irq(unsigned int irq) alcor_enable_irq(irq); } -static struct hw_interrupt_type alcor_irq_type = { +static struct irq_chip alcor_irq_type = { .typename = "ALCOR", .startup = alcor_startup_irq, .shutdown = alcor_disable_irq, diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c index ace475c..ed34943 100644 --- a/arch/alpha/kernel/sys_cabriolet.c +++ b/arch/alpha/kernel/sys_cabriolet.c @@ -71,7 +71,7 @@ cabriolet_end_irq(unsigned int irq) cabriolet_enable_irq(irq); } -static struct hw_interrupt_type cabriolet_irq_type = { +static struct irq_chip cabriolet_irq_type = { .typename = "CABRIOLET", .startup = cabriolet_startup_irq, .shutdown = cabriolet_disable_irq, diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index 5bd5259..46e70ec 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -198,7 +198,7 @@ clipper_set_affinity(unsigned int irq, const struct cpumask *affinity) return 0; } -static struct hw_interrupt_type dp264_irq_type = { +static struct irq_chip dp264_irq_type = { .typename = "DP264", .startup = dp264_startup_irq, .shutdown = dp264_disable_irq, @@ -209,7 +209,7 @@ static struct hw_interrupt_type dp264_irq_type = { .set_affinity = dp264_set_affinity, }; -static struct hw_interrupt_type clipper_irq_type = { +static struct irq_chip clipper_irq_type = { .typename = "CLIPPER", .startup = clipper_startup_irq, .shutdown = clipper_disable_irq, @@ -298,7 +298,7 @@ clipper_srm_device_interrupt(unsigned long vector) } static void __init -init_tsunami_irqs(struct hw_interrupt_type * ops, int imin, int imax) +init_tsunami_irqs(struct irq_chip * ops, int imin, int imax) { long i; for (i = imin; i <= imax; ++i) { diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c index 9c5a306..660c23e 100644 --- a/arch/alpha/kernel/sys_eb64p.c +++ b/arch/alpha/kernel/sys_eb64p.c @@ -69,7 +69,7 @@ eb64p_end_irq(unsigned int irq) eb64p_enable_irq(irq); } -static struct hw_interrupt_type eb64p_irq_type = { +static struct irq_chip eb64p_irq_type = { .typename = "EB64P", .startup = eb64p_startup_irq, .shutdown = eb64p_disable_irq, diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index baf60f3..b99ea48 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -80,7 +80,7 @@ eiger_end_irq(unsigned int irq) eiger_enable_irq(irq); } -static struct hw_interrupt_type eiger_irq_type = { +static struct irq_chip eiger_irq_type = { .typename = "EIGER", .startup = eiger_startup_irq, .shutdown = eiger_disable_irq, diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index 2b5caf3..ef0b83a 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -118,7 +118,7 @@ jensen_local_end(unsigned int irq) i8259a_end_irq(1); } -static struct hw_interrupt_type jensen_local_irq_type = { +static struct irq_chip jensen_local_irq_type = { .typename = "LOCAL", .startup = jensen_local_startup, .shutdown = jensen_local_shutdown, diff --git a/arch/alpha/kernel/sys_marvel.c 
b/arch/alpha/kernel/sys_marvel.c index c5a1a24..bbfc4f2 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -169,7 +169,7 @@ marvel_irq_noop_return(unsigned int irq) return 0; } -static struct hw_interrupt_type marvel_legacy_irq_type = { +static struct irq_chip marvel_legacy_irq_type = { .typename = "LEGACY", .startup = marvel_irq_noop_return, .shutdown = marvel_irq_noop, @@ -179,7 +179,7 @@ static struct hw_interrupt_type marvel_legacy_irq_type = { .end = marvel_irq_noop, }; -static struct hw_interrupt_type io7_lsi_irq_type = { +static struct irq_chip io7_lsi_irq_type = { .typename = "LSI", .startup = io7_startup_irq, .shutdown = io7_disable_irq, @@ -189,7 +189,7 @@ static struct hw_interrupt_type io7_lsi_irq_type = { .end = io7_end_irq, }; -static struct hw_interrupt_type io7_msi_irq_type = { +static struct irq_chip io7_msi_irq_type = { .typename = "MSI", .startup = io7_startup_irq, .shutdown = io7_disable_irq, @@ -273,8 +273,8 @@ init_one_io7_msi(struct io7 *io7, unsigned int which, unsigned int where) static void __init init_io7_irqs(struct io7 *io7, - struct hw_interrupt_type *lsi_ops, - struct hw_interrupt_type *msi_ops) + struct irq_chip *lsi_ops, + struct irq_chip *msi_ops) { long base = (io7->pe << MARVEL_IRQ_VEC_PE_SHIFT) + 16; long i; diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c index 8d3e942..4e36664 100644 --- a/arch/alpha/kernel/sys_mikasa.c +++ b/arch/alpha/kernel/sys_mikasa.c @@ -68,7 +68,7 @@ mikasa_end_irq(unsigned int irq) mikasa_enable_irq(irq); } -static struct hw_interrupt_type mikasa_irq_type = { +static struct irq_chip mikasa_irq_type = { .typename = "MIKASA", .startup = mikasa_startup_irq, .shutdown = mikasa_disable_irq, diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c index 538876b..35753a1 100644 --- a/arch/alpha/kernel/sys_noritake.c +++ b/arch/alpha/kernel/sys_noritake.c @@ -73,7 +73,7 @@ noritake_end_irq(unsigned int irq) noritake_enable_irq(irq); } -static struct hw_interrupt_type noritake_irq_type = { +static struct irq_chip noritake_irq_type = { .typename = "NORITAKE", .startup = noritake_startup_irq, .shutdown = noritake_disable_irq, diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c index 672cb2d..f3aec7e 100644 --- a/arch/alpha/kernel/sys_rawhide.c +++ b/arch/alpha/kernel/sys_rawhide.c @@ -135,7 +135,7 @@ rawhide_end_irq(unsigned int irq) rawhide_enable_irq(irq); } -static struct hw_interrupt_type rawhide_irq_type = { +static struct irq_chip rawhide_irq_type = { .typename = "RAWHIDE", .startup = rawhide_startup_irq, .shutdown = rawhide_disable_irq, diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c index ce1faa6..fc92463 100644 --- a/arch/alpha/kernel/sys_rx164.c +++ b/arch/alpha/kernel/sys_rx164.c @@ -72,7 +72,7 @@ rx164_end_irq(unsigned int irq) rx164_enable_irq(irq); } -static struct hw_interrupt_type rx164_irq_type = { +static struct irq_chip rx164_irq_type = { .typename = "RX164", .startup = rx164_startup_irq, .shutdown = rx164_disable_irq, diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 9e26325..426eb69 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -501,7 +501,7 @@ sable_lynx_mask_and_ack_irq(unsigned int irq) spin_unlock(&sable_lynx_irq_lock); } -static struct hw_interrupt_type sable_lynx_irq_type = { +static struct irq_chip sable_lynx_irq_type = { .typename = "SABLE/LYNX", .startup = sable_lynx_startup_irq, .shutdown = 
sable_lynx_disable_irq, diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c index 9bd9a31..830318c 100644 --- a/arch/alpha/kernel/sys_takara.c +++ b/arch/alpha/kernel/sys_takara.c @@ -74,7 +74,7 @@ takara_end_irq(unsigned int irq) takara_enable_irq(irq); } -static struct hw_interrupt_type takara_irq_type = { +static struct irq_chip takara_irq_type = { .typename = "TAKARA", .startup = takara_startup_irq, .shutdown = takara_disable_irq, diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index 8dd239e..88978fc 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -185,7 +185,7 @@ titan_srm_device_interrupt(unsigned long vector) static void __init -init_titan_irqs(struct hw_interrupt_type * ops, int imin, int imax) +init_titan_irqs(struct irq_chip * ops, int imin, int imax) { long i; for (i = imin; i <= imax; ++i) { @@ -194,7 +194,7 @@ init_titan_irqs(struct hw_interrupt_type * ops, int imin, int imax) } } -static struct hw_interrupt_type titan_irq_type = { +static struct irq_chip titan_irq_type = { .typename = "TITAN", .startup = titan_startup_irq, .shutdown = titan_disable_irq, diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 42c3eed..e91b4c3 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -157,7 +157,7 @@ wildfire_end_irq(unsigned int irq) wildfire_enable_irq(irq); } -static struct hw_interrupt_type wildfire_irq_type = { +static struct irq_chip wildfire_irq_type = { .typename = "WILDFIRE", .startup = wildfire_startup_irq, .shutdown = wildfire_disable_irq, -- cgit v0.10.2 From fb26b3e63e9685ce250377bf905c78425a8e8b2b Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 16 Jun 2009 15:33:25 -0700 Subject: alpha: bad macro expansion, parameter is member `for_each_mem_cluster(x, y, z)' will expand to `for ((y) = (x)->y ...' but correct is `for ((y) = (x)->cluster ...' Signed-off-by: Roel Kluin Cc: Richard Henderson Cc: Ivan Kokshaysky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 80df86c..d2634e4 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -252,9 +252,9 @@ reserve_std_resources(void) } #define PFN_MAX PFN_DOWN(0x80000000) -#define for_each_mem_cluster(memdesc, cluster, i) \ - for ((cluster) = (memdesc)->cluster, (i) = 0; \ - (i) < (memdesc)->numclusters; (i)++, (cluster)++) +#define for_each_mem_cluster(memdesc, _cluster, i) \ + for ((_cluster) = (memdesc)->cluster, (i) = 0; \ + (i) < (memdesc)->numclusters; (i)++, (_cluster)++) static unsigned long __init get_mem_size_limit(char *s) diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index a13de49..0eab557 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -28,9 +28,9 @@ EXPORT_SYMBOL(node_data); #define DBGDCONT(args...)
#endif -#define for_each_mem_cluster(memdesc, cluster, i) \ - for ((cluster) = (memdesc)->cluster, (i) = 0; \ - (i) < (memdesc)->numclusters; (i)++, (cluster)++) +#define for_each_mem_cluster(memdesc, _cluster, i) \ + for ((_cluster) = (memdesc)->cluster, (i) = 0; \ + (i) < (memdesc)->numclusters; (i)++, (_cluster)++) static void __init show_mem_layout(void) { -- cgit v0.10.2 From 189e91f5f5c09043ef78cad956a71ac339203a5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jun 2009 15:33:26 -0700 Subject: m32r: remove obsolete hw_interrupt_type The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) have been kept around for migration reasons. After more than two years it's time to remove them finally. This patch cleans up one of the remaining users. When all such patches hit mainline we can remove the defines and typedefs finally. Impact: cleanup Convert the last remaining users to struct irq_chip and remove the define. Signed-off-by: Thomas Gleixner Cc: Hirokazu Takata Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/m32r/platforms/m32104ut/setup.c b/arch/m32r/platforms/m32104ut/setup.c index 98138b4..922fdfd 100644 --- a/arch/m32r/platforms/m32104ut/setup.c +++ b/arch/m32r/platforms/m32104ut/setup.c @@ -63,7 +63,7 @@ static void shutdown_m32104ut_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type m32104ut_irq_type = +static struct irq_chip m32104ut_irq_type = { .typename = "M32104UT-IRQ", .startup = startup_m32104ut_irq, diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c index 77b0ae9..9c1bc74 100644 --- a/arch/m32r/platforms/m32700ut/setup.c +++ b/arch/m32r/platforms/m32700ut/setup.c @@ -69,7 +69,7 @@ static void shutdown_m32700ut_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type m32700ut_irq_type = +static struct irq_chip m32700ut_irq_type = { .typename = "M32700UT-IRQ", .startup = startup_m32700ut_irq, @@ -146,7 +146,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq) outw(PLD_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type m32700ut_pld_irq_type = +static struct irq_chip m32700ut_pld_irq_type = { .typename = "M32700UT-PLD-IRQ", .startup = startup_m32700ut_pld_irq, @@ -215,7 +215,7 @@ static void shutdown_m32700ut_lanpld_irq(unsigned int irq) outw(PLD_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type m32700ut_lanpld_irq_type = +static struct irq_chip m32700ut_lanpld_irq_type = { .typename = "M32700UT-PLD-LAN-IRQ", .startup = startup_m32700ut_lanpld_irq, @@ -284,7 +284,7 @@ static void shutdown_m32700ut_lcdpld_irq(unsigned int irq) outw(PLD_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type m32700ut_lcdpld_irq_type = +static struct irq_chip m32700ut_lcdpld_irq_type = { .typename = "M32700UT-PLD-LCD-IRQ", .startup = startup_m32700ut_lcdpld_irq, diff --git a/arch/m32r/platforms/mappi/setup.c b/arch/m32r/platforms/mappi/setup.c index 3ec087f..fb4b177 100644 --- a/arch/m32r/platforms/mappi/setup.c +++ b/arch/m32r/platforms/mappi/setup.c @@ -63,7 +63,7 @@ static void shutdown_mappi_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type mappi_irq_type = +static struct irq_chip mappi_irq_type = { .typename = "MAPPI-IRQ", .startup = startup_mappi_irq, diff --git a/arch/m32r/platforms/mappi2/setup.c b/arch/m32r/platforms/mappi2/setup.c index d87969c..6a65eda 100644 --- a/arch/m32r/platforms/mappi2/setup.c +++ b/arch/m32r/platforms/mappi2/setup.c @@ -70,7 +70,7 @@ 
static void shutdown_mappi2_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type mappi2_irq_type = +static struct irq_chip mappi2_irq_type = { .typename = "MAPPI2-IRQ", .startup = startup_mappi2_irq, diff --git a/arch/m32r/platforms/mappi3/setup.c b/arch/m32r/platforms/mappi3/setup.c index 785b4bd..9c337ae 100644 --- a/arch/m32r/platforms/mappi3/setup.c +++ b/arch/m32r/platforms/mappi3/setup.c @@ -70,7 +70,7 @@ static void shutdown_mappi3_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type mappi3_irq_type = +static struct irq_chip mappi3_irq_type = { .typename = "MAPPI3-IRQ", .startup = startup_mappi3_irq, diff --git a/arch/m32r/platforms/oaks32r/setup.c b/arch/m32r/platforms/oaks32r/setup.c index 6faa5db..ed86574 100644 --- a/arch/m32r/platforms/oaks32r/setup.c +++ b/arch/m32r/platforms/oaks32r/setup.c @@ -61,7 +61,7 @@ static void shutdown_oaks32r_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type oaks32r_irq_type = +static struct irq_chip oaks32r_irq_type = { .typename = "OAKS32R-IRQ", .startup = startup_oaks32r_irq, diff --git a/arch/m32r/platforms/opsput/setup.c b/arch/m32r/platforms/opsput/setup.c index fab13fd..80d6806 100644 --- a/arch/m32r/platforms/opsput/setup.c +++ b/arch/m32r/platforms/opsput/setup.c @@ -70,7 +70,7 @@ static void shutdown_opsput_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type opsput_irq_type = +static struct irq_chip opsput_irq_type = { .typename = "OPSPUT-IRQ", .startup = startup_opsput_irq, @@ -147,7 +147,7 @@ static void shutdown_opsput_pld_irq(unsigned int irq) outw(PLD_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type opsput_pld_irq_type = +static struct irq_chip opsput_pld_irq_type = { .typename = "OPSPUT-PLD-IRQ", .startup = startup_opsput_pld_irq, @@ -216,7 +216,7 @@ static void shutdown_opsput_lanpld_irq(unsigned int irq) outw(PLD_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type opsput_lanpld_irq_type = +static struct irq_chip opsput_lanpld_irq_type = { .typename = "OPSPUT-PLD-LAN-IRQ", .startup = startup_opsput_lanpld_irq, @@ -285,7 +285,7 @@ static void shutdown_opsput_lcdpld_irq(unsigned int irq) outw(PLD_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type opsput_lcdpld_irq_type = +static struct irq_chip opsput_lcdpld_irq_type = { "OPSPUT-PLD-LCD-IRQ", startup_opsput_lcdpld_irq, diff --git a/arch/m32r/platforms/usrv/setup.c b/arch/m32r/platforms/usrv/setup.c index 89588d6..7573026 100644 --- a/arch/m32r/platforms/usrv/setup.c +++ b/arch/m32r/platforms/usrv/setup.c @@ -61,7 +61,7 @@ static void shutdown_mappi_irq(unsigned int irq) outl(M32R_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type mappi_irq_type = +static struct irq_chip mappi_irq_type = { .typename = "M32700-IRQ", .startup = startup_mappi_irq, @@ -134,7 +134,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq) outw(PLD_ICUCR_ILEVEL7, port); } -static struct hw_interrupt_type m32700ut_pld_irq_type = +static struct irq_chip m32700ut_pld_irq_type = { .typename = "USRV-PLD-IRQ", .startup = startup_m32700ut_pld_irq, -- cgit v0.10.2 From 7e1cb780452809da09ee47860736a9c8d86d67c6 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 16 Jun 2009 15:33:28 -0700 Subject: uml: UML net driver does not allow for vlans See ancient discussion at http://marc.info/?l=user-mode-linux-devel&m=101990155831279&w=2 Addresses http://bugzilla.kernel.org/show_bug.cgi?id=7854 Signed-off-by: Alan Cox Reported-by: Paolo 'Blaisorblade' 
Giarrusso Cc: Jeff Dike Cc: Roland Kletzing Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h index 63bee15..3dabbe1 100644 --- a/arch/um/include/shared/net_user.h +++ b/arch/um/include/shared/net_user.h @@ -8,7 +8,7 @@ #define ETH_ADDR_LEN (6) #define ETH_HEADER_ETHERTAP (16) -#define ETH_HEADER_OTHER (14) +#define ETH_HEADER_OTHER (26) /* 14 for ethernet + VLAN + MPLS for crazy people */ #define ETH_MAX_PACKET (1500) #define UML_NET_VERSION (4) -- cgit v0.10.2 From 6fa851c3e9746e3cf23694f0571a9e080107ba7c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jun 2009 15:33:29 -0700 Subject: um: remove obsolete hw_interrupt_type The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) have been kept around for migration reasons. After more than two years it's time to remove them finally. This patch cleans up one of the remaining users. When all such patches hit mainline we can remove the defines and typedefs finally. Impact: cleanup Convert the last remaining users to struct irq_chip and remove the define. Signed-off-by: Thomas Gleixner Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 336b615..454cdb4 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -358,7 +358,7 @@ EXPORT_SYMBOL(um_request_irq); EXPORT_SYMBOL(reactivate_fd); /* - * hw_interrupt_type must define (startup || enable) && + * irq_chip must define (startup || enable) && * (shutdown || disable) && end */ static void dummy(unsigned int irq) @@ -366,7 +366,7 @@ static void dummy(unsigned int irq) } /* This is used for everything else than the timer. */ -static struct hw_interrupt_type normal_irq_type = { +static struct irq_chip normal_irq_type = { .typename = "SIGIO", .release = free_irq_by_irq_and_dev, .disable = dummy, @@ -375,7 +375,7 @@ static struct hw_interrupt_type normal_irq_type = { .end = dummy }; -static struct hw_interrupt_type SIGVTALRM_irq_type = { +static struct irq_chip SIGVTALRM_irq_type = { .typename = "SIGVTALRM", .release = free_irq_by_irq_and_dev, .shutdown = dummy, /* never called */ -- cgit v0.10.2 From 276c974ac7965e7335f0f4ab945729d8f30f11b5 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 16 Jun 2009 15:33:30 -0700 Subject: uml: fix a section warning When compiling uml on x86_64: MODPOST vmlinux.o WARNING: vmlinux.o (.__syscall_stub.2): unexpected non-allocatable section. Did you forget to use "ax"/"aw" in a .S file? Note that for example contains section definitions for use in .S files. Because modpost checks for missing SHF_ALLOC section flag. So just add it. 
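For context, the "ax" string in a .section directive maps directly to the ELF SHF_ALLOC and SHF_EXECINSTR flags that modpost checks for. The same effect can be observed from user space with a minimal sketch (the file name and the section name .demo_stub are invented for illustration); the compiler marks a C function's custom section "AX" automatically, which is exactly what hand-written assembly must spell out:

/* secdemo.c - build: gcc -o secdemo secdemo.c
 * inspect: readelf -S secdemo | grep demo_stub   -> flags "AX"
 * A hand-written assembler version of the same thing needs:
 *     .section .demo_stub, "ax"
 */
#include <stdio.h>

__attribute__((noinline, section(".demo_stub")))  /* invented section name */
static int stub_fn(void)
{
	return 42;
}

int main(void)
{
	printf("stub_fn() = %d\n", stub_fn());  /* prints 42 */
	return 0;
}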
Signed-off-by: WANG Cong Cc: Jeff Dike Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/sys-i386/stub.S b/arch/um/sys-i386/stub.S index c41b04b..54a36ec 100644 --- a/arch/um/sys-i386/stub.S +++ b/arch/um/sys-i386/stub.S @@ -1,7 +1,7 @@ #include "as-layout.h" .globl syscall_stub -.section .__syscall_stub, "x" +.section .__syscall_stub, "ax" .globl batch_syscall_stub batch_syscall_stub: diff --git a/arch/um/sys-x86_64/stub.S b/arch/um/sys-x86_64/stub.S index 6d9edf9..20e4a96 100644 --- a/arch/um/sys-x86_64/stub.S +++ b/arch/um/sys-x86_64/stub.S @@ -1,7 +1,7 @@ #include "as-layout.h" .globl syscall_stub -.section .__syscall_stub, "x" +.section .__syscall_stub, "ax" syscall_stub: syscall /* We don't have 64-bit constants, so this constructs the address -- cgit v0.10.2 From b08cd961cc3869e14d841a3ed1641f47b11348f3 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 16 Jun 2009 15:33:32 -0700 Subject: uml: bad macro expansion, parameter is member `ELF_CORE_COPY_REGS(x, y)' will make expansions like: `(x)[0] = (y)->y.gp[0]' but correct is `(x)[0] = (y)->regs.gp[0]' Signed-off-by: Roel Kluin Cc: WANG Cong Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/sys-x86_64/asm/elf.h b/arch/um/sys-x86_64/asm/elf.h index 6e8a919..04b9e87 100644 --- a/arch/um/sys-x86_64/asm/elf.h +++ b/arch/um/sys-x86_64/asm/elf.h @@ -66,28 +66,28 @@ typedef struct user_i387_struct elf_fpregset_t; PT_REGS_R15(regs) = 0; \ } while (0) -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - (pr_reg)[0] = (regs)->regs.gp[0]; \ - (pr_reg)[1] = (regs)->regs.gp[1]; \ - (pr_reg)[2] = (regs)->regs.gp[2]; \ - (pr_reg)[3] = (regs)->regs.gp[3]; \ - (pr_reg)[4] = (regs)->regs.gp[4]; \ - (pr_reg)[5] = (regs)->regs.gp[5]; \ - (pr_reg)[6] = (regs)->regs.gp[6]; \ - (pr_reg)[7] = (regs)->regs.gp[7]; \ - (pr_reg)[8] = (regs)->regs.gp[8]; \ - (pr_reg)[9] = (regs)->regs.gp[9]; \ - (pr_reg)[10] = (regs)->regs.gp[10]; \ - (pr_reg)[11] = (regs)->regs.gp[11]; \ - (pr_reg)[12] = (regs)->regs.gp[12]; \ - (pr_reg)[13] = (regs)->regs.gp[13]; \ - (pr_reg)[14] = (regs)->regs.gp[14]; \ - (pr_reg)[15] = (regs)->regs.gp[15]; \ - (pr_reg)[16] = (regs)->regs.gp[16]; \ - (pr_reg)[17] = (regs)->regs.gp[17]; \ - (pr_reg)[18] = (regs)->regs.gp[18]; \ - (pr_reg)[19] = (regs)->regs.gp[19]; \ - (pr_reg)[20] = (regs)->regs.gp[20]; \ +#define ELF_CORE_COPY_REGS(pr_reg, _regs) \ + (pr_reg)[0] = (_regs)->regs.gp[0]; \ + (pr_reg)[1] = (_regs)->regs.gp[1]; \ + (pr_reg)[2] = (_regs)->regs.gp[2]; \ + (pr_reg)[3] = (_regs)->regs.gp[3]; \ + (pr_reg)[4] = (_regs)->regs.gp[4]; \ + (pr_reg)[5] = (_regs)->regs.gp[5]; \ + (pr_reg)[6] = (_regs)->regs.gp[6]; \ + (pr_reg)[7] = (_regs)->regs.gp[7]; \ + (pr_reg)[8] = (_regs)->regs.gp[8]; \ + (pr_reg)[9] = (_regs)->regs.gp[9]; \ + (pr_reg)[10] = (_regs)->regs.gp[10]; \ + (pr_reg)[11] = (_regs)->regs.gp[11]; \ + (pr_reg)[12] = (_regs)->regs.gp[12]; \ + (pr_reg)[13] = (_regs)->regs.gp[13]; \ + (pr_reg)[14] = (_regs)->regs.gp[14]; \ + (pr_reg)[15] = (_regs)->regs.gp[15]; \ + (pr_reg)[16] = (_regs)->regs.gp[16]; \ + (pr_reg)[17] = (_regs)->regs.gp[17]; \ + (pr_reg)[18] = (_regs)->regs.gp[18]; \ + (pr_reg)[19] = (_regs)->regs.gp[19]; \ + (pr_reg)[20] = (_regs)->regs.gp[20]; \ (pr_reg)[21] = current->thread.arch.fs; \ (pr_reg)[22] = 0; \ (pr_reg)[23] = 0; \ -- cgit v0.10.2 From a7d932af06e8eee2163627d19898e18da5635449 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 16 Jun 2009 15:33:33 -0700 Subject: utsname.h: make new_utsname fields use the proper length
constant The members of the new_utsname structure are defined with magic numbers that *should* correspond to the constant __NEW_UTS_LEN+1. Everywhere else, code assumes this and uses the constant, so this patch makes the structure match. Originally suggested by Serge here: https://lists.linux-foundation.org/pipermail/containers/2009-March/016258.html Signed-off-by: Dan Smith Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 1123267..3656b30 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -22,12 +22,12 @@ struct old_utsname { }; struct new_utsname { - char sysname[65]; - char nodename[65]; - char release[65]; - char version[65]; - char machine[65]; - char domainname[65]; + char sysname[__NEW_UTS_LEN + 1]; + char nodename[__NEW_UTS_LEN + 1]; + char release[__NEW_UTS_LEN + 1]; + char version[__NEW_UTS_LEN + 1]; + char machine[__NEW_UTS_LEN + 1]; + char domainname[__NEW_UTS_LEN + 1]; }; #ifdef __KERNEL__ -- cgit v0.10.2 From 417dcdf99ec9f8d8d6917189130bdc17cb67c678 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Tue, 16 Jun 2009 15:33:33 -0700 Subject: atomic: only take lock when the counter drops to zero on UP as well _atomic_dec_and_lock() should not unconditionally take the lock before calling atomic_dec_and_test() in the UP case. For consistency reasons it should behave exactly like in the SMP case. Besides that this works around the problem that with CONFIG_DEBUG_SPINLOCK this spins in __spin_lock_debug() if the lock is already taken even if the counter doesn't drop to 0. Signed-off-by: Jan Blunck Acked-by: Paul E. McKenney Acked-by: Nick Piggin Cc: Valerie Aurora Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index a65c314..e73822a 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -19,11 +19,10 @@ */ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { -#ifdef CONFIG_SMP /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; -#endif + /* Otherwise do it the slow way */ spin_lock(lock); if (atomic_dec_and_test(atomic)) -- cgit v0.10.2 From b33112d1cc25e658c334125d127a6ae15d5a0ad6 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Tue, 16 Jun 2009 15:33:34 -0700 Subject: kernel/kfifo.c: replace conditional test with is_power_of_2() Signed-off-by: Robert P. J. Day Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kfifo.c b/kernel/kfifo.c index bc41ad0..26539e3 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) /* * round up to the next power of 2, since our 'let the indices - * wrap' tachnique works only in this case. + * wrap' technique works only in this case. */ - if (size & (size - 1)) { + if (!is_power_of_2(size)) { BUG_ON(size > 0x80000000); size = roundup_pow_of_two(size); } -- cgit v0.10.2 From 02d5341ae53d32681241b27a40397475caef1c83 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Tue, 16 Jun 2009 15:33:35 -0700 Subject: ntfs: use is_power_of_2() function for clarity. Signed-off-by: Robert P. J. 
Day Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 82c5085..9938034 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "aops.h" #include "attrib.h" @@ -1570,7 +1571,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) ntfs_debug("Index collation rule is 0x%x.", le32_to_cpu(ir->collation_rule)); ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) { + if (!is_power_of_2(ni->itype.index.block_size)) { ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " "two.", ni->itype.index.block_size); goto unm_err_out; diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index d7932e9..89b0298 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "attrib.h" #include "aops.h" @@ -65,7 +66,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi, logfile_log_page_size < NTFS_BLOCK_SIZE || logfile_system_page_size & (logfile_system_page_size - 1) || - logfile_log_page_size & (logfile_log_page_size - 1)) { + !is_power_of_2(logfile_log_page_size)) { ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); return false; } -- cgit v0.10.2 From 4938d7e0233a455f04507bac81d0886c71529537 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Jun 2009 15:33:36 -0700 Subject: poll: avoid extra wakeups in select/poll After the introduction of the keyed wakeups Davide Libenzi did for epoll, we are able to avoid spurious wakeups in the poll()/select() code too. For example, a typical use of poll()/select() is to wait for incoming network frames on many sockets. But TX completion for UDP/TCP frames calls sock_wfree(), which in turn schedules the thread. When scheduled, the thread does a full scan of all polled fds and can sleep again, because nothing is really available. If the number of fds is large, this causes significant load. This patch makes select()/poll() aware of keyed wakeups, so useless wakeups are avoided. This reduces the number of context switches by about 50% on some setups, as well as the work performed by softirq handlers. Signed-off-by: Eric Dumazet Acked-by: David S.
Miller Acked-by: Andi Kleen Acked-by: Ingo Molnar Acked-by: Davide Libenzi Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/select.c b/fs/select.c index 0fe0e14..d870237 100644 --- a/fs/select.c +++ b/fs/select.c @@ -168,7 +168,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -194,6 +194,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -205,6 +215,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; + entry->key = p->key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -362,6 +373,18 @@ get_max: #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit) +{ + if (wait) { + wait->key = POLLEX_SET; + if (in & bit) + wait->key |= POLLIN_SET; + if (out & bit) + wait->key |= POLLOUT_SET; + } +} + int do_select(int n, fd_set_bits *fds, struct timespec *end_time) { ktime_t expire, *to = NULL; @@ -418,20 +441,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if (file) { f_op = file->f_op; mask = DEFAULT_POLLMASK; - if (f_op && f_op->poll) - mask = (*f_op->poll)(file, retval ? NULL : wait); + if (f_op && f_op->poll) { + wait_key_set(wait, in, out, bit); + mask = (*f_op->poll)(file, wait); + } fput_light(file, fput_needed); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; + wait = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; + wait = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; + wait = NULL; } } } @@ -685,8 +713,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = POLLNVAL; if (file != NULL) { mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) + if (file->f_op && file->f_op->poll) { + if (pwait) + pwait->key = pollfd->events | + POLLERR | POLLHUP; mask = file->f_op->poll(file, pwait); + } /* Mask out unneeded events. 
*/ mask &= pollfd->events | POLLERR | POLLHUP; fput_light(file, fput_needed); diff --git a/include/linux/poll.h b/include/linux/poll.h index 8c24ef8..fa287f2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -32,6 +32,7 @@ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_ typedef struct poll_table_struct { poll_queue_proc qproc; + unsigned long key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -43,10 +44,12 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->qproc = qproc; + pt->key = ~0UL; /* all events enabled */ } struct poll_table_entry { struct file *filp; + unsigned long key; wait_queue_t wait; wait_queue_head_t *wait_address; }; -- cgit v0.10.2 From 0d9c25dde878a636ee9a9b53923569171bf9a55b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 16 Jun 2009 15:33:37 -0700 Subject: headers: move module_bug_finalize()/module_bug_cleanup() definitions into module.h They're in linux/bug.h at present, which causes include order tangles. In particular, linux/bug.h cannot be used by linux/atomic.h because, according to Nikanth: linux/bug.h pulls in linux/module.h => linux/spinlock.h => asm/spinlock.h (which uses atomic_inc) => asm/atomic.h. bug.h is a pretty low-level thing and module.h is a higher-level thing, IMO. Cc: Nikanth Karthikesan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/bug.h b/include/linux/bug.h index 54398d2..d276b55 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -1,7 +1,6 @@ #ifndef _LINUX_BUG_H #define _LINUX_BUG_H -#include #include enum bug_trap_type { @@ -24,10 +23,6 @@ const struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); -int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, - struct module *); -void module_bug_cleanup(struct module *); - /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); @@ -38,13 +33,6 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, { return BUG_TRAP_TYPE_BUG; } -static inline int module_bug_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod) -{ - return 0; -} -static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #endif /* _LINUX_BUG_H */ diff --git a/include/linux/module.h b/include/linux/module.h index a7bc6e7..505f20d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -697,4 +697,21 @@ static inline void module_remove_modinfo_attrs(struct module *mod) #define __MODULE_STRING(x) __stringify(x) + +#ifdef CONFIG_GENERIC_BUG +int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, + struct module *); +void module_bug_cleanup(struct module *); + +#else /* !CONFIG_GENERIC_BUG */ + +static inline int module_bug_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) +{ + return 0; +} +static inline void module_bug_cleanup(struct module *mod) {} +#endif /* CONFIG_GENERIC_BUG */ + #endif /* _LINUX_MODULE_H */ -- cgit v0.10.2 From 10fc89d01a7ea2ecc2a58d2f4e7700f47178cd62 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 16 Jun 2009 15:33:38 -0700 Subject: drbd: add major number to major.h Since we have had a LANANA major number for years, and it is documented in devices.txt, I think that this first 
patch can go upstream. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/major.h b/include/linux/major.h index 058ec15..6a8ca98 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -145,6 +145,7 @@ #define UNIX98_PTY_MAJOR_COUNT 8 #define UNIX98_PTY_SLAVE_MAJOR (UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) +#define DRBD_MAJOR 147 #define RTF_MAJOR 150 #define RAW_MAJOR 162 -- cgit v0.10.2 From 8b0b1db0133e4218a9b45c09e53793c039edebe1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jun 2009 15:33:39 -0700 Subject: remove put_cpu_no_resched() put_cpu_no_resched() is an optimization of put_cpu() which unfortunately can cause high latencies. The nfs iostats code uses put_cpu_no_resched() in a code sequence where a reschedule request caused by an interrupt between the get_cpu() and the put_cpu_no_resched() can delay the reschedule for at least HZ. The other users of put_cpu_no_resched() optimize correctly in interrupt code, but there is no real harm in using the put_cpu() function which is an alias for preempt_enable(). The extra check of the preempt count is not as critical as the potential source of missing a reschedule. Debugged in the preempt-rt tree and verified in mainline. Impact: remove a high latency source [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Tony Luck Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 8a06dc48..bdc176c 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -5595,7 +5595,7 @@ pfm_interrupt_handler(int irq, void *arg) (*pfm_alt_intr_handler->handler)(irq, arg, regs); } - put_cpu_no_resched(); + put_cpu(); return IRQ_HANDLED; } diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a2ab252..ceda50a 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->events[stat]++; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_inc_stats(const struct inode *inode, @@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->bytes[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_add_stats(const struct inode *inode, @@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode, cpu = get_cpu(); iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); iostats->fscache[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } #endif diff --git a/include/linux/smp.h b/include/linux/smp.h index a69db82..9e3d8af 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -177,7 +177,6 @@ static inline void init_call_single_data(void) #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() /* * Callback to arch code if there's nosmp or maxcpus=0 on the -- cgit v0.10.2 From 30639b6af85a92491b22dd14c17b14ca11da60e6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jun 2009 15:33:40 -0700 Subject: groups: move code to kernel/groups.c Move supplementary groups implementation to kernel/groups.c.
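Conceptually, the code being moved keeps each task's supplementary gid list sorted once (groups_sort(), a Shell sort) and then answers membership queries with a binary search (groups_search()). A rough user-space model of that design, assuming only standard C and invented gid values:

/* groups_demo.c - build: gcc -o groups_demo groups_demo.c */
#include <stdio.h>
#include <stdlib.h>

static int cmp_gid(const void *a, const void *b)
{
	unsigned int x = *(const unsigned int *)a;
	unsigned int y = *(const unsigned int *)b;
	return (x > y) - (x < y);
}

/* mirrors groups_search(): returns 1 if grp is in the sorted list */
static int gid_search(const unsigned int *g, unsigned int n, unsigned int grp)
{
	unsigned int left = 0, right = n;

	while (left < right) {
		unsigned int mid = (left + right) / 2;
		if (grp > g[mid])
			left = mid + 1;
		else if (grp < g[mid])
			right = mid;
		else
			return 1;
	}
	return 0;
}

int main(void)
{
	unsigned int groups[] = { 1000, 27, 4, 24, 100 };  /* invented gids */
	unsigned int n = sizeof(groups) / sizeof(groups[0]);

	/* stand-in for groups_sort(): sort once, at setgroups() time... */
	qsort(groups, n, sizeof(groups[0]), cmp_gid);
	/* ...so every in_group_p()-style query is O(log n) */
	printf("27 member? %d\n", gid_search(groups, n, 27));  /* 1 */
	printf("42 member? %d\n", gid_search(groups, n, 42));  /* 0 */
	return 0;
}

Sorting on the rare setgroups() path keeps the hot permission-check path cheap, which is the same trade-off the kernel code makes.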
kernel/sys.c already accumulated quite a few random stuff. Do strictly copy/paste + add required headers to compile. Compile-tested on many configs and archs. Signed-off-by: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/Makefile b/kernel/Makefile index 90b53f6..9df4501 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ async.o +obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/groups.c b/kernel/groups.c new file mode 100644 index 0000000..2b45b2e --- /dev/null +++ b/kernel/groups.c @@ -0,0 +1,288 @@ +/* + * Supplementary group IDs + */ +#include +#include +#include +#include +#include +#include + +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + +struct group_info *groups_alloc(int gidsetsize) +{ + struct group_info *group_info; + int nblocks; + int i; + + nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; + /* Make sure we always allocate at least one indirect block pointer */ + nblocks = nblocks ? : 1; + group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); + if (!group_info) + return NULL; + group_info->ngroups = gidsetsize; + group_info->nblocks = nblocks; + atomic_set(&group_info->usage, 1); + + if (gidsetsize <= NGROUPS_SMALL) + group_info->blocks[0] = group_info->small_block; + else { + for (i = 0; i < nblocks; i++) { + gid_t *b; + b = (void *)__get_free_page(GFP_USER); + if (!b) + goto out_undo_partial_alloc; + group_info->blocks[i] = b; + } + } + return group_info; + +out_undo_partial_alloc: + while (--i >= 0) { + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); + return NULL; +} + +EXPORT_SYMBOL(groups_alloc); + +void groups_free(struct group_info *group_info) +{ + if (group_info->blocks[0] != group_info->small_block) { + int i; + for (i = 0; i < group_info->nblocks; i++) + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); +} + +EXPORT_SYMBOL(groups_free); + +/* export the group_info to a user-space array */ +static int groups_to_user(gid_t __user *grouplist, + const struct group_info *group_info) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_to_user(grouplist, group_info->blocks[i], len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + return 0; +} + +/* fill a group_info from a user-space array - it must be allocated already */ +static int groups_from_user(struct group_info *group_info, + gid_t __user *grouplist) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_from_user(group_info->blocks[i], grouplist, len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + return 0; +} + +/* a simple Shell sort */ +static void groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride 
+ 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = GROUP_AT(group_info, right); + + while (left >= 0 && GROUP_AT(group_info, left) > tmp) { + GROUP_AT(group_info, right) = + GROUP_AT(group_info, left); + right = left; + left -= stride; + } + GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} + +/* a simple bsearch */ +int groups_search(const struct group_info *group_info, gid_t grp) +{ + unsigned int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + unsigned int mid = (left+right)/2; + int cmp = grp - GROUP_AT(group_info, mid); + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +/** + * set_groups - Change a group subscription in a set of credentials + * @new: The newly prepared set of credentials to alter + * @group_info: The group list to install + * + * Validate a group subscription and, if valid, insert it into a set + * of credentials. + */ +int set_groups(struct cred *new, struct group_info *group_info) +{ + int retval; + + retval = security_task_setgroups(group_info); + if (retval) + return retval; + + put_group_info(new->group_info); + groups_sort(group_info); + get_group_info(group_info); + new->group_info = group_info; + return 0; +} + +EXPORT_SYMBOL(set_groups); + +/** + * set_current_groups - Change current's group subscription + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon current's task + * security record. + */ +int set_current_groups(struct group_info *group_info) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = set_groups(new, group_info); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); +} + +EXPORT_SYMBOL(set_current_groups); + +SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + const struct cred *cred = current_cred(); + int i; + + if (gidsetsize < 0) + return -EINVAL; + + /* no need to grab task_lock here; it cannot change */ + i = cred->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups_to_user(grouplist, cred->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + return i; +} + +/* + * SMP: Our groups are copy-on-write. We can set them safely + * without another task interfering. + */ + +SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + struct group_info *group_info; + int retval; + + if (!capable(CAP_SETGID)) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +/* + * Check whether we're fsgid/egid or in the supplemental group.. 
+ */ +int in_group_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->fsgid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_group_p); + +int in_egroup_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->egid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_egroup_p); diff --git a/kernel/sys.c b/kernel/sys.c index 438d99a..b3f1097 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1113,289 +1113,6 @@ out: return err; } -/* - * Supplementary group IDs - */ - -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - -struct group_info *groups_alloc(int gidsetsize) -{ - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? : 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) - return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - gid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; - -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; -} - -EXPORT_SYMBOL(groups_alloc); - -void groups_free(struct group_info *group_info) -{ - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); -} - -EXPORT_SYMBOL(groups_free); - -/* export the group_info to a user-space array */ -static int groups_to_user(gid_t __user *grouplist, - const struct group_info *group_info) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_to_user(grouplist, group_info->blocks[i], len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} - -/* fill a group_info from a user-space array - it must be allocated already */ -static int groups_from_user(struct group_info *group_info, - gid_t __user *grouplist) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_from_user(group_info->blocks[i], grouplist, len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} - -/* a simple Shell sort */ -static void groups_sort(struct group_info *group_info) -{ - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - gid_t tmp = GROUP_AT(group_info, right); - - while (left >= 0 && GROUP_AT(group_info, left) > tmp) { - 
GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); - right = left; - left -= stride; - } - GROUP_AT(group_info, right) = tmp; - } - stride /= 3; - } -} - -/* a simple bsearch */ -int groups_search(const struct group_info *group_info, gid_t grp) -{ - unsigned int left, right; - - if (!group_info) - return 0; - - left = 0; - right = group_info->ngroups; - while (left < right) { - unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) - left = mid + 1; - else if (cmp < 0) - right = mid; - else - return 1; - } - return 0; -} - -/** - * set_groups - Change a group subscription in a set of credentials - * @new: The newly prepared set of credentials to alter - * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. - */ -int set_groups(struct cred *new, struct group_info *group_info) -{ - int retval; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - - put_group_info(new->group_info); - groups_sort(group_info); - get_group_info(group_info); - new->group_info = group_info; - return 0; -} - -EXPORT_SYMBOL(set_groups); - -/** - * set_current_groups - Change current's group subscription - * @group_info: The group list to impose - * - * Validate a group subscription and, if valid, impose it upon current's task - * security record. - */ -int set_current_groups(struct group_info *group_info) -{ - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); -} - -EXPORT_SYMBOL(set_current_groups); - -SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - const struct cred *cred = current_cred(); - int i; - - if (gidsetsize < 0) - return -EINVAL; - - /* no need to grab task_lock here; it cannot change */ - i = cred->group_info->ngroups; - if (gidsetsize) { - if (i > gidsetsize) { - i = -EINVAL; - goto out; - } - if (groups_to_user(grouplist, cred->group_info)) { - i = -EFAULT; - goto out; - } - } -out: - return i; -} - -/* - * SMP: Our groups are copy-on-write. We can set them safely - * without another task interfering. - */ - -SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - struct group_info *group_info; - int retval; - - if (!capable(CAP_SETGID)) - return -EPERM; - if ((unsigned)gidsetsize > NGROUPS_MAX) - return -EINVAL; - - group_info = groups_alloc(gidsetsize); - if (!group_info) - return -ENOMEM; - retval = groups_from_user(group_info, grouplist); - if (retval) { - put_group_info(group_info); - return retval; - } - - retval = set_current_groups(group_info); - put_group_info(group_info); - - return retval; -} - -/* - * Check whether we're fsgid/egid or in the supplemental group.. 
- */ -int in_group_p(gid_t grp) -{ - const struct cred *cred = current_cred(); - int retval = 1; - - if (grp != cred->fsgid) - retval = groups_search(cred->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_group_p); - -int in_egroup_p(gid_t grp) -{ - const struct cred *cred = current_cred(); - int retval = 1; - - if (grp != cred->egid) - retval = groups_search(cred->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_egroup_p); - DECLARE_RWSEM(uts_sem); SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) -- cgit v0.10.2 From b72b71c6cb6ecc564d4d5f9c512a7df269837846 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 16 Jun 2009 15:33:42 -0700 Subject: lib: do code optimization for radix_tree_lookup() and radix_tree_lookup_slot() radix_tree_lookup() and radix_tree_lookup_slot() have much the same code except for the return value. Introduce radix_tree_lookup_element() to do the real work. /* * is_slot == 1 : search for the slot. * is_slot == 0 : search for the node. */ static void * radix_tree_lookup_element(struct radix_tree_root *root, unsigned long index, int is_slot); Signed-off-by: Huang Shijie Cc: Nick Piggin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 5301a52..23abbd9 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -351,20 +351,12 @@ int radix_tree_insert(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_insert); -/** - * radix_tree_lookup_slot - lookup a slot in a radix tree - * @root: radix tree root - * @index: index key - * - * Returns: the slot corresponding to the position @index in the - * radix tree @root. This is useful for update-if-exists operations. - * - * This function can be called under rcu_read_lock iff the slot is not - * modified by radix_tree_replace_slot, otherwise it must be called - * exclusive from other writers. Any dereference of the slot must be done - * using radix_tree_deref_slot. +/* + * is_slot == 1 : search for the slot. + * is_slot == 0 : search for the node. */ -void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) +static void *radix_tree_lookup_element(struct radix_tree_root *root, + unsigned long index, int is_slot) { unsigned int height, shift; struct radix_tree_node *node, **slot; @@ -376,7 +368,7 @@ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) if (!radix_tree_is_indirect_ptr(node)) { if (index > 0) return NULL; - return (void **)&root->rnode; + return is_slot ? (void *)&root->rnode : node; } node = radix_tree_indirect_to_ptr(node); @@ -397,7 +389,25 @@ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) height--; } while (height > 0); - return (void **)slot; + return is_slot ? (void *)slot:node; +} + +/** + * radix_tree_lookup_slot - lookup a slot in a radix tree + * @root: radix tree root + * @index: index key + * + * Returns: the slot corresponding to the position @index in the + * radix tree @root. This is useful for update-if-exists operations. + * + * This function can be called under rcu_read_lock iff the slot is not + * modified by radix_tree_replace_slot, otherwise it must be called + * exclusive from other writers. Any dereference of the slot must be done + * using radix_tree_deref_slot. 
+ */ +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) +{ + return (void **)radix_tree_lookup_element(root, index, 1); } EXPORT_SYMBOL(radix_tree_lookup_slot); @@ -415,38 +425,7 @@ EXPORT_SYMBOL(radix_tree_lookup_slot); */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { - unsigned int height, shift; - struct radix_tree_node *node, **slot; - - node = rcu_dereference(root->rnode); - if (node == NULL) - return NULL; - - if (!radix_tree_is_indirect_ptr(node)) { - if (index > 0) - return NULL; - return node; - } - node = radix_tree_indirect_to_ptr(node); - - height = node->height; - if (index > radix_tree_maxindex(height)) - return NULL; - - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - do { - slot = (struct radix_tree_node **) - (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); - node = rcu_dereference(*slot); - if (node == NULL) - return NULL; - - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } while (height > 0); - - return node; + return radix_tree_lookup_element(root, index, 0); } EXPORT_SYMBOL(radix_tree_lookup); -- cgit v0.10.2 From 009789f040b71699278e70a6664701c10065e430 Mon Sep 17 00:00:00 2001 From: Chris Peterson Date: Tue, 16 Jun 2009 15:33:43 -0700 Subject: slow-work: use round_jiffies() for thread pool's cull and OOM timers Round the slow work queue's cull and OOM timeouts to a whole-second boundary with round_jiffies(). The slow work queue uses a pair of timers to cull idle threads and, after OOM, to delay new thread creation. This patch also extracts the mod_timer() logic for the cull timer into a separate helper function. By rounding non-time-critical timers such as these to whole seconds, they will be batched up to fire at the same time rather than being spread out. This allows the CPU to wake up less, which saves power. Signed-off-by: Chris Peterson Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 521ed20..09d7519 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -319,6 +319,15 @@ cant_get_ref: EXPORT_SYMBOL(slow_work_enqueue); /* + * Schedule a cull of the thread pool at some time in the near future + */ +static void slow_work_schedule_cull(void) +{ + mod_timer(&slow_work_cull_timer, + round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); +} + +/* * Worker thread culling algorithm */ static bool slow_work_cull_thread(void) @@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void) list_empty(&vslow_work_queue) && atomic_read(&slow_work_thread_count) > slow_work_min_threads) { - mod_timer(&slow_work_cull_timer, - jiffies + SLOW_WORK_CULL_TIMEOUT); + slow_work_schedule_cull(); do_cull = true; } } @@ -393,8 +401,7 @@ static int slow_work_thread(void *_data) list_empty(&vslow_work_queue) && atomic_read(&slow_work_thread_count) > slow_work_min_threads) - mod_timer(&slow_work_cull_timer, - jiffies + SLOW_WORK_CULL_TIMEOUT); + slow_work_schedule_cull(); continue; } @@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work) if (atomic_dec_and_test(&slow_work_thread_count)) BUG(); /* we're running on a slow work thread...
*/ mod_timer(&slow_work_oom_timer, - jiffies + SLOW_WORK_OOM_TIMEOUT); + round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); } else { /* ratelimit the starting of new threads */ mod_timer(&slow_work_oom_timer, jiffies + 1); @@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, if (n < 0 && !slow_work_may_not_start_new_thread) slow_work_enqueue(&slow_work_new_thread); else if (n > 0) - mod_timer(&slow_work_cull_timer, - jiffies + SLOW_WORK_CULL_TIMEOUT); + slow_work_schedule_cull(); } mutex_unlock(&slow_work_user_lock); } @@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, atomic_read(&slow_work_thread_count); if (n < 0) - mod_timer(&slow_work_cull_timer, - jiffies + SLOW_WORK_CULL_TIMEOUT); + slow_work_schedule_cull(); } mutex_unlock(&slow_work_user_lock); } -- cgit v0.10.2 From a9c569539312cfd3c820b38036679a9d72c55331 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:44 -0700 Subject: use printk_once() in several places There are some places to be able to use printk_once instead of hard coding. Signed-off-by: Minchan Kim Cc: Dominik Brodowski Cc: David S. Miller Cc: Ingo Molnar Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3ffdcfa..9fa3388 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,7 +487,6 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - static int printed; int i; for (i = 0; i < X86_VENDOR_NUM; i++) { @@ -504,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) } } - if (!printed) { - printed++; - printk(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n", v); - - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; diff --git a/drivers/pcmcia/pcmcia_ioctl.c b/drivers/pcmcia/pcmcia_ioctl.c index 1703b20..6095f8d 100644 --- a/drivers/pcmcia/pcmcia_ioctl.c +++ b/drivers/pcmcia/pcmcia_ioctl.c @@ -915,12 +915,9 @@ static int ds_ioctl(struct inode * inode, struct file * file, err = -EPERM; goto free_out; } else { - static int printed = 0; - if (!printed) { - printk(KERN_WARNING "2.6. kernels use pcmciamtd instead of memory_cs.c and do not require special\n"); - printk(KERN_WARNING "MTD handling any more.\n"); - printed++; - } + printk_once(KERN_WARNING + "2.6. kernels use pcmciamtd instead of memory_cs.c and do not require special\n"); + printk_once(KERN_WARNING "MTD handling any more.\n"); } err = -EINVAL; goto free_out; -- cgit v0.10.2 From c67ae69b661f3c2fe1a9c8259bc948c68b082166 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 16 Jun 2009 15:33:45 -0700 Subject: hexdump: remove the trailing space For example: hex_dump_to_buffer("AB", 2, 16, 1, buf, 100, 0); pr_info("[%s]\n", buf); I'd expect the output to be "[41 42]", but actually it's "[41 42 ]" This patch also makes the required buf to be minimum. To print the hex format of "AB", a buf with size 6 should be sufficient, but hex_dump_to_buffer() required at least 8. 
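The underlying fix is the standard separator idiom: emit the delimiter before every element except the first (the j ? " " : "" expression in the diff below), so no trailing blank is ever produced and n bytes need only 3*n - 1 characters plus the terminating NUL, i.e. a 6-byte buffer for "AB". A stand-alone user-space sketch of the same pattern, with invented function and buffer names:

/* hexfmt.c - build: gcc -o hexfmt hexfmt.c */
#include <stdio.h>

static void hex_format(const unsigned char *buf, size_t len,
		       char *out, size_t outlen)
{
	size_t j, lx = 0;
	int n;

	if (!outlen)
		return;
	out[0] = '\0';
	for (j = 0; j < len; j++) {
		/* separator goes between items, never after the last one */
		n = snprintf(out + lx, outlen - lx, "%s%02x",
			     j ? " " : "", (unsigned int)buf[j]);
		if (n < 0 || (size_t)n >= outlen - lx)
			break;		/* buffer full, stop cleanly */
		lx += (size_t)n;
	}
}

int main(void)
{
	char buf[6];			/* exactly "41 42" plus the NUL */

	hex_format((const unsigned char *)"AB", 2, buf, sizeof(buf));
	printf("[%s]\n", buf);		/* prints [41 42], no trailing space */
	return 0;
}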
Signed-off-by: Li Zefan Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/hexdump.c b/lib/hexdump.c index f07c0db..39af256 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -65,7 +65,8 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, for (j = 0; j < ngroups; j++) lx += scnprintf(linebuf + lx, linebuflen - lx, - "%16.16llx ", (unsigned long long)*(ptr8 + j)); + "%s%16.16llx", j ? " " : "", + (unsigned long long)*(ptr8 + j)); ascii_column = 17 * ngroups + 2; break; } @@ -76,7 +77,7 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, for (j = 0; j < ngroups; j++) lx += scnprintf(linebuf + lx, linebuflen - lx, - "%8.8x ", *(ptr4 + j)); + "%s%8.8x", j ? " " : "", *(ptr4 + j)); ascii_column = 9 * ngroups + 2; break; } @@ -87,19 +88,21 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, for (j = 0; j < ngroups; j++) lx += scnprintf(linebuf + lx, linebuflen - lx, - "%4.4x ", *(ptr2 + j)); + "%s%4.4x", j ? " " : "", *(ptr2 + j)); ascii_column = 5 * ngroups + 2; break; } default: - for (j = 0; (j < rowsize) && (j < len) && (lx + 4) < linebuflen; - j++) { + for (j = 0; (j < len) && (lx + 3) <= linebuflen; j++) { ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); linebuf[lx++] = hex_asc_lo(ch); linebuf[lx++] = ' '; } + if (j) + lx--; + ascii_column = 3 * rowsize + 2; break; } @@ -108,7 +111,7 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, while (lx < (linebuflen - 1) && lx < (ascii_column - 1)) linebuf[lx++] = ' '; - for (j = 0; (j < rowsize) && (j < len) && (lx + 2) < linebuflen; j++) + for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) linebuf[lx++] = (isascii(ptr[j]) && isprint(ptr[j])) ? ptr[j] : '.'; nil: -- cgit v0.10.2 From b8d9a86590fb334d28c5905a4c419ece7d08e37d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 16 Jun 2009 15:33:46 -0700 Subject: Documentation/accounting/getdelays.c initialize the variable before using it Fix compilation warning: Documentation/accounting/getdelays.c: In function `main': Documentation/accounting/getdelays.c:249: warning: `cmd_type' may be used uninitialized in this function This is in fact a false positive. Signed-off-by: Jaswinder Singh Rajput Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index 7ea2311..aa73e72 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -246,7 +246,8 @@ void print_ioacct(struct taskstats *t) int main(int argc, char *argv[]) { - int c, rc, rep_len, aggr_len, len2, cmd_type; + int c, rc, rep_len, aggr_len, len2; + int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; __u16 id; __u32 mypid; -- cgit v0.10.2 From e4c9dd0fbad60c098a026e9b06d9de1bc98c5e89 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 16 Jun 2009 15:33:47 -0700 Subject: kmap_types: make most arches use generic header file Convert most arches to use asm-generic/kmap_types.h. Move the KM_FENCE_ macro additions into asm-generic/kmap_types.h, controlled by __WITH_KM_FENCE from each arch's kmap_types.h file. Would be nice to be able to add custom KM_types per arch, but I don't yet see a nice, clean way to do that. Built on x86_64, i386, mips, sparc, alpha(tonyb), powerpc(tonyb), and 68k(tonyb). Note: avr32 should be able to remove KM_PTE2 (since it's not used) and then just use the generic kmap_types.h file. Get avr32 maintainer approval. 
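For reference, a sketch of the mechanism the generic header centralizes (abridged; the full enum lives in include/asm-generic/kmap_types.h, and this is illustrative rather than a new definition). An arch header now just defines __WITH_KM_FENCE under its debug option before including the generic file:

#ifdef __WITH_KM_FENCE
# define D(n) __KM_FENCE_##n ,
#else
# define D(n)
#endif

enum km_type {
D(0)	KM_BOUNCE_READ,
D(1)	KM_SKB_SUNRPC_DATA,
	/* ... KM_SKB_DATA_SOFTIRQ through KM_SOFTIRQ1 elided ... */
D(13)	KM_TYPE_NR
};

#undef D

When __WITH_KM_FENCE is defined, each D(n) injects an extra __KM_FENCE_n enumerator between the real slots, presumably making stray reuse of a neighbouring kmap_atomic() slot easier to catch in CONFIG_DEBUG_HIGHMEM builds.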
Signed-off-by: Randy Dunlap Cc: Acked-by: Mike Frysinger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Bryan Wu Cc: Mikael Starvik Cc: Hirokazu Takata Cc: "Luck Tony" Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Arnd Bergmann Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/alpha/include/asm/kmap_types.h b/arch/alpha/include/asm/kmap_types.h index 3e6735a..a8d4ec8 100644 --- a/arch/alpha/include/asm/kmap_types.h +++ b/arch/alpha/include/asm/kmap_types.h @@ -3,30 +3,12 @@ /* Dummy header just to define km_type. */ - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/blackfin/include/asm/kmap_types.h b/arch/blackfin/include/asm/kmap_types.h index e215f71..0a88622 100644 --- a/arch/blackfin/include/asm/kmap_types.h +++ b/arch/blackfin/include/asm/kmap_types.h @@ -1,21 +1,6 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/cris/include/asm/kmap_types.h b/arch/cris/include/asm/kmap_types.h index 492988c..d2d643c 100644 --- a/arch/cris/include/asm/kmap_types.h +++ b/arch/cris/include/asm/kmap_types.h @@ -5,21 +5,6 @@ * is actually used on cris. 
*/ -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/h8300/include/asm/kmap_types.h b/arch/h8300/include/asm/kmap_types.h index 1ec8a34..be12a71 100644 --- a/arch/h8300/include/asm/kmap_types.h +++ b/arch/h8300/include/asm/kmap_types.h @@ -1,21 +1,6 @@ #ifndef _ASM_H8300_KMAP_TYPES_H #define _ASM_H8300_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/ia64/include/asm/kmap_types.h b/arch/ia64/include/asm/kmap_types.h index 5d1658a..05d5f99 100644 --- a/arch/ia64/include/asm/kmap_types.h +++ b/arch/ia64/include/asm/kmap_types.h @@ -1,30 +1,12 @@ #ifndef _ASM_IA64_KMAP_TYPES_H #define _ASM_IA64_KMAP_TYPES_H - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif /* _ASM_IA64_KMAP_TYPES_H */ diff --git a/arch/m32r/include/asm/kmap_types.h b/arch/m32r/include/asm/kmap_types.h index fa94dc6..4cdb5e3 100644 --- a/arch/m32r/include/asm/kmap_types.h +++ b/arch/m32r/include/asm/kmap_types.h @@ -2,28 +2,11 @@ #define __M32R_KMAP_TYPES_H #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif /* __M32R_KMAP_TYPES_H */ diff --git a/arch/m68k/include/asm/kmap_types.h b/arch/m68k/include/asm/kmap_types.h index c843c63..3413cc1 100644 --- a/arch/m68k/include/asm/kmap_types.h +++ b/arch/m68k/include/asm/kmap_types.h @@ -1,21 +1,6 @@ #ifndef __ASM_M68K_KMAP_TYPES_H #define __ASM_M68K_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif /* __ASM_M68K_KMAP_TYPES_H */ diff --git a/arch/microblaze/include/asm/kmap_types.h b/arch/microblaze/include/asm/kmap_types.h index 4d7e222..2597525 100644 --- a/arch/microblaze/include/asm/kmap_types.h +++ b/arch/microblaze/include/asm/kmap_types.h @@ -1,29 +1,6 @@ -/* - * Copyright (C) 2006 Atmark Techno, Inc. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. 
- */ - #ifndef _ASM_MICROBLAZE_KMAP_TYPES_H #define _ASM_MICROBLAZE_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR, -}; +#include #endif /* _ASM_MICROBLAZE_KMAP_TYPES_H */ diff --git a/arch/mips/include/asm/kmap_types.h b/arch/mips/include/asm/kmap_types.h index 806aae3..58e91ed 100644 --- a/arch/mips/include/asm/kmap_types.h +++ b/arch/mips/include/asm/kmap_types.h @@ -1,30 +1,12 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/mn10300/include/asm/kmap_types.h b/arch/mn10300/include/asm/kmap_types.h index 3398f9f..76d093b 100644 --- a/arch/mn10300/include/asm/kmap_types.h +++ b/arch/mn10300/include/asm/kmap_types.h @@ -1,31 +1,6 @@ -/* MN10300 kmap_atomic() slot IDs - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif /* _ASM_KMAP_TYPES_H */ diff --git a/arch/parisc/include/asm/kmap_types.h b/arch/parisc/include/asm/kmap_types.h index 806aae3..58e91ed 100644 --- a/arch/parisc/include/asm/kmap_types.h +++ b/arch/parisc/include/asm/kmap_types.h @@ -1,30 +1,12 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/s390/include/asm/kmap_types.h b/arch/s390/include/asm/kmap_types.h index fd15746..94ec3ee 100644 --- a/arch/s390/include/asm/kmap_types.h +++ b/arch/s390/include/asm/kmap_types.h @@ -2,22 +2,7 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/kmap_types.h b/arch/sh/include/asm/kmap_types.h index 84d565c..5962b08 100644 --- a/arch/sh/include/asm/kmap_types.h +++ b/arch/sh/include/asm/kmap_types.h @@ -3,30 +3,12 @@ /* Dummy header just to 
define km_type. */ - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/sparc/include/asm/kmap_types.h b/arch/sparc/include/asm/kmap_types.h index 602f5e0..aad2174 100644 --- a/arch/sparc/include/asm/kmap_types.h +++ b/arch/sparc/include/asm/kmap_types.h @@ -5,21 +5,6 @@ * is actually used on sparc. -DaveM */ -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h index 5759c16..9e00a73 100644 --- a/arch/x86/include/asm/kmap_types.h +++ b/arch/x86/include/asm/kmap_types.h @@ -2,28 +2,11 @@ #define _ASM_X86_KMAP_TYPES_H #if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/xtensa/include/asm/kmap_types.h b/arch/xtensa/include/asm/kmap_types.h index 9e822d2..11c687e 100644 --- a/arch/xtensa/include/asm/kmap_types.h +++ b/arch/xtensa/include/asm/kmap_types.h @@ -1,31 +1,6 @@ -/* - * include/asm-xtensa/kmap_types.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - #ifndef _XTENSA_KMAP_TYPES_H #define _XTENSA_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif /* _XTENSA_KMAP_TYPES_H */ diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h index 58c3305..54e8b3d 100644 --- a/include/asm-generic/kmap_types.h +++ b/include/asm-generic/kmap_types.h @@ -1,7 +1,7 @@ #ifndef _ASM_GENERIC_KMAP_TYPES_H #define _ASM_GENERIC_KMAP_TYPES_H -#ifdef CONFIG_DEBUG_HIGHMEM +#ifdef __WITH_KM_FENCE # define D(n) __KM_FENCE_##n , #else # define D(n) -- cgit v0.10.2 From cc6f26774136b7f5307abcd3887f08360c9b7554 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 16 Jun 2009 15:33:49 -0700 Subject: syscalls.h: remove duplicated declarations for sys_pipe2 sys_pipe2 is declared twice in include/linux/syscalls.h. 
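As a standalone illustration (plain C with the kernel annotations dropped; not from the patch) of why the duplicate lingered: identical repeated prototypes are valid C and compile silently, and they only become an error if the two copies drift apart:

long sys_pipe2(int *fildes, int flags);
long sys_pipe2(int *fildes, int flags);	/* duplicate declaration: accepted silently */
/* long sys_pipe2(int *fildes, long flags); would fail: conflicting types */

int main(void) { return 0; }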
Signed-off-by: Masatake YAMATO Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 418d90f..fa4242c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -434,6 +434,7 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg); #endif +asmlinkage long sys_pipe(int __user *fildes); asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); @@ -751,8 +752,6 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, struct timespec __user *, const sigset_t __user *, size_t); -asmlinkage long sys_pipe2(int __user *, int); -asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v0.10.2 From 73d05163d15e4a400db63df906c55260a6dae987 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 16 Jun 2009 15:33:50 -0700 Subject: eisa.ids: add Network Peripherals FDDI boards Add EISA IDs for Network Peripherals FDDI boards. Descriptions taken from the respective EISA configuration files. It's unlikely we'll ever support these cards, the problem being the lack of documentation. Assuming the policy for the EISA ID database is the same as for PCI I'm sending these entries for the sake of completeness. Signed-off-by: Maciej W. Rozycki Cc: Marc Zyngier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/eisa/eisa.ids b/drivers/eisa/eisa.ids index ed69837..6cbb7a5 100644 --- a/drivers/eisa/eisa.ids +++ b/drivers/eisa/eisa.ids @@ -1140,6 +1140,11 @@ NON0301 "c't Universale Graphic Adapter" NON0401 "c't Universal Ethernet Adapter" NON0501 "c't Universal 16-Bit Sound Adapter" NON0601 "c't Universal 8-Bit Adapter" +NPI0120 "Network Peripherals NP-EISA-1 FDDI Interface" +NPI0221 "Network Peripherals NP-EISA-2 FDDI Interface" +NPI0223 "Network Peripherals NP-EISA-2E Enhanced FDDI Interface" +NPI0301 "Network Peripherals NP-EISA-3 FDDI Interface" +NPI0303 "Network Peripherals NP-EISA-3E Enhanced FDDI Interface" NSS0011 "Newport Systems Solutions WNIC Adapter" NVL0701 "Novell NE3200 Bus Master Ethernet" NVL0702 "Novell NE3200T Bus Master Ethernet" -- cgit v0.10.2 From 4764e280dc7dde1534161e148d38dbd792a2b8ab Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Tue, 16 Jun 2009 15:33:51 -0700 Subject: Documentation/atomic_ops.txt: fix sample code list_add() lost a parameter in sample code. Signed-off-by: Figo.zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt index 4ef2450..396bec3 100644 --- a/Documentation/atomic_ops.txt +++ b/Documentation/atomic_ops.txt @@ -229,10 +229,10 @@ kernel. 
It is the use of atomic counters to implement reference counting, and it works such that once the counter falls to zero it can be guaranteed that no other entity can be accessing the object: -static void obj_list_add(struct obj *obj) +static void obj_list_add(struct obj *obj, struct list_head *head) { obj->active = 1; - list_add(&obj->list); + list_add(&obj->list, head); } static void obj_list_del(struct obj *obj) -- cgit v0.10.2 From f324edc85e5c1137e49e3b36a58cf436ab5b1fb3 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 16 Jun 2009 15:33:52 -0700 Subject: console: make blank timeout value a boot option The console blank timer is currently hardcoded to 10*60 seconds, which might be annoying on systems with no input devices attached to wake up the console again. Especially during development, disabling the screen saver can be handy - for example when debugging the root fs mount mechanism or other scenarios where no userspace program could be started to do that at runtime. This patch defines a core_param for the variable in charge which allows users to entirely disable the blank feature at boot time by setting it to 0. The value can still be overwritten at runtime using the standard ioctl call - this just allows the default to be changed. Signed-off-by: Daniel Mack Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ad38006..5578248 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -546,6 +546,10 @@ and is between 256 and 4096 characters. It is defined in the file console=brl,ttyS0 For now, only VisioBraille is supported. + consoleblank= [KNL] The console blank (screen saver) timeout in + seconds. Defaults to 10*60 = 10mins. A value of 0 + disables the blank timer. + coredump_filter= [KNL] Change the default value for /proc//coredump_filter. diff --git a/drivers/char/vt.c b/drivers/char/vt.c index c796a86..d9113b4 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -171,8 +171,9 @@ int do_poke_blanked_console; int console_blanked; static int vesa_blank_mode; /* 0:none 1:suspendV 2:suspendH 3:powerdown */ -static int blankinterval = 10*60*HZ; static int vesa_off_interval; +static int blankinterval = 10*60; +core_param(consoleblank, blankinterval, int, 0444); static DECLARE_WORK(console_work, console_callback); @@ -1485,7 +1486,7 @@ static void setterm_command(struct vc_data *vc) update_attr(vc); break; case 9: /* set blanking interval */ - blankinterval = ((vc->vc_par[1] < 60) ? vc->vc_par[1] : 60) * 60 * HZ; + blankinterval = ((vc->vc_par[1] < 60) ? 
vc->vc_par[1] : 60) * 60; poke_blanked_console(); break; case 10: /* set bell frequency in Hz */ @@ -2871,7 +2872,7 @@ static int __init con_init(void) if (blankinterval) { blank_state = blank_normal_wait; - mod_timer(&console_timer, jiffies + blankinterval); + mod_timer(&console_timer, jiffies + (blankinterval * HZ)); } for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { @@ -3677,7 +3678,7 @@ void do_unblank_screen(int leaving_gfx) return; /* but leave console_blanked != 0 */ if (blankinterval) { - mod_timer(&console_timer, jiffies + blankinterval); + mod_timer(&console_timer, jiffies + (blankinterval * HZ)); blank_state = blank_normal_wait; } @@ -3711,7 +3712,7 @@ void unblank_screen(void) static void blank_screen_t(unsigned long dummy) { if (unlikely(!keventd_up())) { - mod_timer(&console_timer, jiffies + blankinterval); + mod_timer(&console_timer, jiffies + (blankinterval * HZ)); return; } blank_timer_expired = 1; @@ -3741,7 +3742,7 @@ void poke_blanked_console(void) if (console_blanked) unblank_screen(); else if (blankinterval) { - mod_timer(&console_timer, jiffies + blankinterval); + mod_timer(&console_timer, jiffies + (blankinterval * HZ)); blank_state = blank_normal_wait; } } -- cgit v0.10.2 From 55e331cf7ebe20665253770589cd9eb06048bf25 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 16 Jun 2009 15:33:53 -0700 Subject: drivers: add support for the TI VLYNQ bus Add support for the TI VLYNQ high-speed, serial and packetized bus. This bus allows external devices to be connected to the System-on-Chip and appear in the main system memory just like any memory mapped peripheral. It is widely used in TI's networking and multimedia SoC, including the AR7 SoC. Signed-off-by: Eugene Konev Signed-off-by: Florian Fainelli Cc: Ralf Baechle Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 2cb7566..b735f7d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6281,6 +6281,14 @@ F: drivers/net/macvlan.c F: include/linux/if_*vlan.h F: net/8021q/ +VLYNQ BUS +P: Florian Fainelli +M: florian@openwrt.org +L: openwrt-devel@lists.openwrt.org +S: Maintained +F: drivers/vlynq/vlynq.c +F: include/linux/vlynq.h + VOLTAGE AND CURRENT REGULATOR FRAMEWORK P: Liam Girdwood M: lrg@slimlogic.co.uk diff --git a/drivers/Kconfig b/drivers/Kconfig index 00cf955..a442c8f 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -104,6 +104,8 @@ source "drivers/auxdisplay/Kconfig" source "drivers/uio/Kconfig" +source "drivers/vlynq/Kconfig" + source "drivers/xen/Kconfig" source "drivers/staging/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 9e7d4e5..00b44f4 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_VIRTIO) += virtio/ +obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ obj-y += platform/ obj-y += ieee802154/ diff --git a/drivers/vlynq/Kconfig b/drivers/vlynq/Kconfig new file mode 100644 index 0000000..f654221 --- /dev/null +++ b/drivers/vlynq/Kconfig @@ -0,0 +1,20 @@ +menu "TI VLYNQ" + +config VLYNQ + bool "TI VLYNQ bus support" + depends on AR7 && EXPERIMENTAL + help + Support for Texas Instruments(R) VLYNQ bus. + The VLYNQ bus is a high-speed, serial and packetized + data bus which allows external peripherals of a SoC + to appear into the system's main memory. 
+ + If unsure, say N + +config VLYNQ_DEBUG + bool "VLYNQ bus debug" + depends on VLYNQ && KERNEL_DEBUG + help + Turn on VLYNQ bus debugging. + +endmenu diff --git a/drivers/vlynq/Makefile b/drivers/vlynq/Makefile new file mode 100644 index 0000000..b3f6114 --- /dev/null +++ b/drivers/vlynq/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for kernel vlynq drivers +# + +obj-$(CONFIG_VLYNQ) += vlynq.o diff --git a/drivers/vlynq/vlynq.c b/drivers/vlynq/vlynq.c new file mode 100644 index 0000000..7335433 --- /dev/null +++ b/drivers/vlynq/vlynq.c @@ -0,0 +1,814 @@ +/* + * Copyright (C) 2006, 2007 Eugene Konev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Parts of the VLYNQ specification can be found here: + * http://www.ti.com/litv/pdf/sprue36a + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VLYNQ_CTRL_PM_ENABLE 0x80000000 +#define VLYNQ_CTRL_CLOCK_INT 0x00008000 +#define VLYNQ_CTRL_CLOCK_DIV(x) (((x) & 7) << 16) +#define VLYNQ_CTRL_INT_LOCAL 0x00004000 +#define VLYNQ_CTRL_INT_ENABLE 0x00002000 +#define VLYNQ_CTRL_INT_VECTOR(x) (((x) & 0x1f) << 8) +#define VLYNQ_CTRL_INT2CFG 0x00000080 +#define VLYNQ_CTRL_RESET 0x00000001 + +#define VLYNQ_CTRL_CLOCK_MASK (0x7 << 16) + +#define VLYNQ_INT_OFFSET 0x00000014 +#define VLYNQ_REMOTE_OFFSET 0x00000080 + +#define VLYNQ_STATUS_LINK 0x00000001 +#define VLYNQ_STATUS_LERROR 0x00000080 +#define VLYNQ_STATUS_RERROR 0x00000100 + +#define VINT_ENABLE 0x00000100 +#define VINT_TYPE_EDGE 0x00000080 +#define VINT_LEVEL_LOW 0x00000040 +#define VINT_VECTOR(x) ((x) & 0x1f) +#define VINT_OFFSET(irq) (8 * ((irq) % 4)) + +#define VLYNQ_AUTONEGO_V2 0x00010000 + +struct vlynq_regs { + u32 revision; + u32 control; + u32 status; + u32 int_prio; + u32 int_status; + u32 int_pending; + u32 int_ptr; + u32 tx_offset; + struct vlynq_mapping rx_mapping[4]; + u32 chip; + u32 autonego; + u32 unused[6]; + u32 int_device[8]; +}; + +#ifdef VLYNQ_DEBUG +static void vlynq_dump_regs(struct vlynq_device *dev) +{ + int i; + + printk(KERN_DEBUG "VLYNQ local=%p remote=%p\n", + dev->local, dev->remote); + for (i = 0; i < 32; i++) { + printk(KERN_DEBUG "VLYNQ: local %d: %08x\n", + i + 1, ((u32 *)dev->local)[i]); + printk(KERN_DEBUG "VLYNQ: remote %d: %08x\n", + i + 1, ((u32 *)dev->remote)[i]); + } +} + +static void vlynq_dump_mem(u32 *base, int count) +{ + int i; + + for (i = 0; i < (count + 3) / 4; i++) { + if (i % 4 == 0) + printk(KERN_DEBUG "\nMEM[0x%04x]:", i * 4); + printk(KERN_DEBUG " 0x%08x", *(base + i)); + } + printk(KERN_DEBUG "\n"); +} +#endif + +/* Check the VLYNQ link status with a given device */ +static int vlynq_linked(struct vlynq_device *dev) +{ + int i; + + for (i = 0; i < 100; i++) + if (readl(&dev->local->status) & VLYNQ_STATUS_LINK) + return 1; + else + cpu_relax(); + + return 0; +} + +static void 
vlynq_reset(struct vlynq_device *dev) +{ + writel(readl(&dev->local->control) | VLYNQ_CTRL_RESET, + &dev->local->control); + + /* Wait for the devices to finish resetting */ + msleep(5); + + /* Remove reset bit */ + writel(readl(&dev->local->control) & ~VLYNQ_CTRL_RESET, + &dev->local->control); + + /* Give some time for the devices to settle */ + msleep(5); +} + +static void vlynq_irq_unmask(unsigned int irq) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + val |= (VINT_ENABLE | virq) << VINT_OFFSET(virq); + writel(val, &dev->remote->int_device[virq >> 2]); +} + +static void vlynq_irq_mask(unsigned int irq) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + val &= ~(VINT_ENABLE << VINT_OFFSET(virq)); + writel(val, &dev->remote->int_device[virq >> 2]); +} + +static int vlynq_irq_type(unsigned int irq, unsigned int flow_type) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + switch (flow_type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + case IRQ_TYPE_EDGE_FALLING: + case IRQ_TYPE_EDGE_BOTH: + val |= VINT_TYPE_EDGE << VINT_OFFSET(virq); + val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq)); + break; + case IRQ_TYPE_LEVEL_HIGH: + val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq)); + val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq)); + break; + case IRQ_TYPE_LEVEL_LOW: + val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq)); + val |= VINT_LEVEL_LOW << VINT_OFFSET(virq); + break; + default: + return -EINVAL; + } + writel(val, &dev->remote->int_device[virq >> 2]); + return 0; +} + +static void vlynq_local_ack(unsigned int irq) +{ + struct vlynq_device *dev = get_irq_chip_data(irq); + + u32 status = readl(&dev->local->status); + + pr_debug("%s: local status: 0x%08x\n", + dev_name(&dev->dev), status); + writel(status, &dev->local->status); +} + +static void vlynq_remote_ack(unsigned int irq) +{ + struct vlynq_device *dev = get_irq_chip_data(irq); + + u32 status = readl(&dev->remote->status); + + pr_debug("%s: remote status: 0x%08x\n", + dev_name(&dev->dev), status); + writel(status, &dev->remote->status); +} + +static irqreturn_t vlynq_irq(int irq, void *dev_id) +{ + struct vlynq_device *dev = dev_id; + u32 status; + int virq = 0; + + status = readl(&dev->local->int_status); + writel(status, &dev->local->int_status); + + if (unlikely(!status)) + spurious_interrupt(); + + while (status) { + if (status & 1) + do_IRQ(dev->irq_start + virq); + status >>= 1; + virq++; + } + + return IRQ_HANDLED; +} + +static struct irq_chip vlynq_irq_chip = { + .name = "vlynq", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .set_type = vlynq_irq_type, +}; + +static struct irq_chip vlynq_local_chip = { + .name = "vlynq local error", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .ack = vlynq_local_ack, +}; + +static struct irq_chip vlynq_remote_chip = { + .name = "vlynq local error", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .ack = vlynq_remote_ack, +}; + +static int vlynq_setup_irq(struct vlynq_device *dev) +{ + u32 val; + int i, virq; + + if (dev->local_irq == dev->remote_irq) { + printk(KERN_ERR + "%s: local vlynq irq should be different from remote\n", + dev_name(&dev->dev)); + return -EINVAL; + } + + 
/* Clear local and remote error bits */ + writel(readl(&dev->local->status), &dev->local->status); + writel(readl(&dev->remote->status), &dev->remote->status); + + /* Now setup interrupts */ + val = VLYNQ_CTRL_INT_VECTOR(dev->local_irq); + val |= VLYNQ_CTRL_INT_ENABLE | VLYNQ_CTRL_INT_LOCAL | + VLYNQ_CTRL_INT2CFG; + val |= readl(&dev->local->control); + writel(VLYNQ_INT_OFFSET, &dev->local->int_ptr); + writel(val, &dev->local->control); + + val = VLYNQ_CTRL_INT_VECTOR(dev->remote_irq); + val |= VLYNQ_CTRL_INT_ENABLE; + val |= readl(&dev->remote->control); + writel(VLYNQ_INT_OFFSET, &dev->remote->int_ptr); + writel(val, &dev->remote->int_ptr); + writel(val, &dev->remote->control); + + for (i = dev->irq_start; i <= dev->irq_end; i++) { + virq = i - dev->irq_start; + if (virq == dev->local_irq) { + set_irq_chip_and_handler(i, &vlynq_local_chip, + handle_level_irq); + set_irq_chip_data(i, dev); + } else if (virq == dev->remote_irq) { + set_irq_chip_and_handler(i, &vlynq_remote_chip, + handle_level_irq); + set_irq_chip_data(i, dev); + } else { + set_irq_chip_and_handler(i, &vlynq_irq_chip, + handle_simple_irq); + set_irq_chip_data(i, dev); + writel(0, &dev->remote->int_device[virq >> 2]); + } + } + + if (request_irq(dev->irq, vlynq_irq, IRQF_SHARED, "vlynq", dev)) { + printk(KERN_ERR "%s: request_irq failed\n", + dev_name(&dev->dev)); + return -EAGAIN; + } + + return 0; +} + +static void vlynq_device_release(struct device *dev) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + kfree(vdev); +} + +static int vlynq_device_match(struct device *dev, + struct device_driver *drv) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + struct vlynq_driver *vdrv = to_vlynq_driver(drv); + struct vlynq_device_id *ids = vdrv->id_table; + + while (ids->id) { + if (ids->id == vdev->dev_id) { + vdev->divisor = ids->divisor; + vlynq_set_drvdata(vdev, ids); + printk(KERN_INFO "Driver found for VLYNQ " + "device: %08x\n", vdev->dev_id); + return 1; + } + printk(KERN_DEBUG "Not using the %08x VLYNQ device's driver" + " for VLYNQ device: %08x\n", ids->id, vdev->dev_id); + ids++; + } + return 0; +} + +static int vlynq_device_probe(struct device *dev) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + struct vlynq_driver *drv = to_vlynq_driver(dev->driver); + struct vlynq_device_id *id = vlynq_get_drvdata(vdev); + int result = -ENODEV; + + if (drv->probe) + result = drv->probe(vdev, id); + if (result) + put_device(dev); + return result; +} + +static int vlynq_device_remove(struct device *dev) +{ + struct vlynq_driver *drv = to_vlynq_driver(dev->driver); + + if (drv->remove) + drv->remove(to_vlynq_device(dev)); + + return 0; +} + +int __vlynq_register_driver(struct vlynq_driver *driver, struct module *owner) +{ + driver->driver.name = driver->name; + driver->driver.bus = &vlynq_bus_type; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL(__vlynq_register_driver); + +void vlynq_unregister_driver(struct vlynq_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL(vlynq_unregister_driver); + +/* + * A VLYNQ remote device can clock the VLYNQ bus master + * using a dedicated clock line. In that case, both the + * remove device and the bus master should have the same + * serial clock dividers configured. Iterate through the + * 8 possible dividers until we actually link with the + * device. + */ +static int __vlynq_try_remote(struct vlynq_device *dev) +{ + int i; + + vlynq_reset(dev); + for (i = dev->dev_id ? vlynq_rdiv2 : vlynq_rdiv8; dev->dev_id ? 
+ i <= vlynq_rdiv8 : i >= vlynq_rdiv2; + dev->dev_id ? i++ : i--) { + + if (!vlynq_linked(dev)) + break; + + writel((readl(&dev->remote->control) & + ~VLYNQ_CTRL_CLOCK_MASK) | + VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1), + &dev->remote->control); + writel((readl(&dev->local->control) + & ~(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_MASK)) | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using remote clock divisor %d\n", + dev_name(&dev->dev), i - vlynq_rdiv1 + 1); + dev->divisor = i; + return 0; + } else { + vlynq_reset(dev); + } + } + + return -ENODEV; +} + +/* + * A VLYNQ remote device can be clocked by the VLYNQ bus + * master using a dedicated clock line. In that case, only + * the bus master configures the serial clock divider. + * Iterate through the 8 possible dividers until we + * actually get a link with the device. + */ +static int __vlynq_try_local(struct vlynq_device *dev) +{ + int i; + + vlynq_reset(dev); + + for (i = dev->dev_id ? vlynq_ldiv2 : vlynq_ldiv8; dev->dev_id ? + i <= vlynq_ldiv8 : i >= vlynq_ldiv2; + dev->dev_id ? i++ : i--) { + + writel((readl(&dev->local->control) & + ~VLYNQ_CTRL_CLOCK_MASK) | + VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_ldiv1), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using local clock divisor %d\n", + dev_name(&dev->dev), i - vlynq_ldiv1 + 1); + dev->divisor = i; + return 0; + } else { + vlynq_reset(dev); + } + } + + return -ENODEV; +} + +/* + * When using external clocking method, serial clock + * is supplied by an external oscillator, therefore we + * should mask the local clock bit in the clock control + * register for both the bus master and the remote device. + */ +static int __vlynq_try_external(struct vlynq_device *dev) +{ + vlynq_reset(dev); + if (!vlynq_linked(dev)) + return -ENODEV; + + writel((readl(&dev->remote->control) & + ~VLYNQ_CTRL_CLOCK_INT), + &dev->remote->control); + + writel((readl(&dev->local->control) & + ~VLYNQ_CTRL_CLOCK_INT), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG "%s: using external clock\n", + dev_name(&dev->dev)); + dev->divisor = vlynq_div_external; + return 0; + } + + return -ENODEV; +} + +static int __vlynq_enable_device(struct vlynq_device *dev) +{ + int result; + struct plat_vlynq_ops *ops = dev->dev.platform_data; + + result = ops->on(dev); + if (result) + return result; + + switch (dev->divisor) { + case vlynq_div_external: + case vlynq_div_auto: + /* When the device is brought from reset it should have clock + * generation negotiated by hardware. 
+ * Check which device is generating clocks and perform setup + * accordingly */ + if (vlynq_linked(dev) && readl(&dev->remote->control) & + VLYNQ_CTRL_CLOCK_INT) { + if (!__vlynq_try_remote(dev) || + !__vlynq_try_local(dev) || + !__vlynq_try_external(dev)) + return 0; + } else { + if (!__vlynq_try_external(dev) || + !__vlynq_try_local(dev) || + !__vlynq_try_remote(dev)) + return 0; + } + break; + case vlynq_ldiv1: + case vlynq_ldiv2: + case vlynq_ldiv3: + case vlynq_ldiv4: + case vlynq_ldiv5: + case vlynq_ldiv6: + case vlynq_ldiv7: + case vlynq_ldiv8: + writel(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(dev->divisor - + vlynq_ldiv1), &dev->local->control); + writel(0, &dev->remote->control); + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using local clock divisor %d\n", + dev_name(&dev->dev), + dev->divisor - vlynq_ldiv1 + 1); + return 0; + } + break; + case vlynq_rdiv1: + case vlynq_rdiv2: + case vlynq_rdiv3: + case vlynq_rdiv4: + case vlynq_rdiv5: + case vlynq_rdiv6: + case vlynq_rdiv7: + case vlynq_rdiv8: + writel(0, &dev->local->control); + writel(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(dev->divisor - + vlynq_rdiv1), &dev->remote->control); + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using remote clock divisor %d\n", + dev_name(&dev->dev), + dev->divisor - vlynq_rdiv1 + 1); + return 0; + } + break; + } + + ops->off(dev); + return -ENODEV; +} + +int vlynq_enable_device(struct vlynq_device *dev) +{ + struct plat_vlynq_ops *ops = dev->dev.platform_data; + int result = -ENODEV; + + result = __vlynq_enable_device(dev); + if (result) + return result; + + result = vlynq_setup_irq(dev); + if (result) + ops->off(dev); + + dev->enabled = !result; + return result; +} +EXPORT_SYMBOL(vlynq_enable_device); + + +void vlynq_disable_device(struct vlynq_device *dev) +{ + struct plat_vlynq_ops *ops = dev->dev.platform_data; + + dev->enabled = 0; + free_irq(dev->irq, dev); + ops->off(dev); +} +EXPORT_SYMBOL(vlynq_disable_device); + +int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping) +{ + int i; + + if (!dev->enabled) + return -ENXIO; + + writel(tx_offset, &dev->local->tx_offset); + for (i = 0; i < 4; i++) { + writel(mapping[i].offset, &dev->local->rx_mapping[i].offset); + writel(mapping[i].size, &dev->local->rx_mapping[i].size); + } + return 0; +} +EXPORT_SYMBOL(vlynq_set_local_mapping); + +int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping) +{ + int i; + + if (!dev->enabled) + return -ENXIO; + + writel(tx_offset, &dev->remote->tx_offset); + for (i = 0; i < 4; i++) { + writel(mapping[i].offset, &dev->remote->rx_mapping[i].offset); + writel(mapping[i].size, &dev->remote->rx_mapping[i].size); + } + return 0; +} +EXPORT_SYMBOL(vlynq_set_remote_mapping); + +int vlynq_set_local_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if (dev->enabled) + return -EBUSY; + + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + if (virq == dev->remote_irq) + return -EINVAL; + + dev->local_irq = virq; + + return 0; +} +EXPORT_SYMBOL(vlynq_set_local_irq); + +int vlynq_set_remote_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if (dev->enabled) + return -EBUSY; + + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + if (virq == dev->local_irq) + return -EINVAL; + + dev->remote_irq = virq; + + return 0; +} +EXPORT_SYMBOL(vlynq_set_remote_irq); + +static int vlynq_probe(struct platform_device 
*pdev) +{ + struct vlynq_device *dev; + struct resource *regs_res, *mem_res, *irq_res; + int len, result; + + regs_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs"); + if (!regs_res) + return -ENODEV; + + mem_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mem"); + if (!mem_res) + return -ENODEV; + + irq_res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, "devirq"); + if (!irq_res) + return -ENODEV; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + printk(KERN_ERR + "vlynq: failed to allocate device structure\n"); + return -ENOMEM; + } + + dev->id = pdev->id; + dev->dev.bus = &vlynq_bus_type; + dev->dev.parent = &pdev->dev; + dev_set_name(&dev->dev, "vlynq%d", dev->id); + dev->dev.platform_data = pdev->dev.platform_data; + dev->dev.release = vlynq_device_release; + + dev->regs_start = regs_res->start; + dev->regs_end = regs_res->end; + dev->mem_start = mem_res->start; + dev->mem_end = mem_res->end; + + len = regs_res->end - regs_res->start; + if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) { + printk(KERN_ERR "%s: Can't request vlynq registers\n", + dev_name(&dev->dev)); + result = -ENXIO; + goto fail_request; + } + + dev->local = ioremap(regs_res->start, len); + if (!dev->local) { + printk(KERN_ERR "%s: Can't remap vlynq registers\n", + dev_name(&dev->dev)); + result = -ENXIO; + goto fail_remap; + } + + dev->remote = (struct vlynq_regs *)((void *)dev->local + + VLYNQ_REMOTE_OFFSET); + + dev->irq = platform_get_irq_byname(pdev, "irq"); + dev->irq_start = irq_res->start; + dev->irq_end = irq_res->end; + dev->local_irq = dev->irq_end - dev->irq_start; + dev->remote_irq = dev->local_irq - 1; + + if (device_register(&dev->dev)) + goto fail_register; + platform_set_drvdata(pdev, dev); + + printk(KERN_INFO "%s: regs 0x%p, irq %d, mem 0x%p\n", + dev_name(&dev->dev), (void *)dev->regs_start, dev->irq, + (void *)dev->mem_start); + + dev->dev_id = 0; + dev->divisor = vlynq_div_auto; + result = __vlynq_enable_device(dev); + if (result == 0) { + dev->dev_id = readl(&dev->remote->chip); + ((struct plat_vlynq_ops *)(dev->dev.platform_data))->off(dev); + } + if (dev->dev_id) + printk(KERN_INFO "Found a VLYNQ device: %08x\n", dev->dev_id); + + return 0; + +fail_register: + iounmap(dev->local); +fail_remap: +fail_request: + release_mem_region(regs_res->start, len); + kfree(dev); + return result; +} + +static int vlynq_remove(struct platform_device *pdev) +{ + struct vlynq_device *dev = platform_get_drvdata(pdev); + + device_unregister(&dev->dev); + iounmap(dev->local); + release_mem_region(dev->regs_start, dev->regs_end - dev->regs_start); + + kfree(dev); + + return 0; +} + +static struct platform_driver vlynq_platform_driver = { + .driver.name = "vlynq", + .probe = vlynq_probe, + .remove = __devexit_p(vlynq_remove), +}; + +struct bus_type vlynq_bus_type = { + .name = "vlynq", + .match = vlynq_device_match, + .probe = vlynq_device_probe, + .remove = vlynq_device_remove, +}; +EXPORT_SYMBOL(vlynq_bus_type); + +static int __devinit vlynq_init(void) +{ + int res = 0; + + res = bus_register(&vlynq_bus_type); + if (res) + goto fail_bus; + + res = platform_driver_register(&vlynq_platform_driver); + if (res) + goto fail_platform; + + return 0; + +fail_platform: + bus_unregister(&vlynq_bus_type); +fail_bus: + return res; +} + +static void __devexit vlynq_exit(void) +{ + platform_driver_unregister(&vlynq_platform_driver); + bus_unregister(&vlynq_bus_type); +} + +module_init(vlynq_init); +module_exit(vlynq_exit); diff --git a/include/linux/vlynq.h 
b/include/linux/vlynq.h new file mode 100644 index 0000000..8f6a958 --- /dev/null +++ b/include/linux/vlynq.h @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2006, 2007 Eugene Konev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __VLYNQ_H__ +#define __VLYNQ_H__ + +#include +#include +#include + +#define VLYNQ_NUM_IRQS 32 + +struct vlynq_mapping { + u32 size; + u32 offset; +}; + +enum vlynq_divisor { + vlynq_div_auto = 0, + vlynq_ldiv1, + vlynq_ldiv2, + vlynq_ldiv3, + vlynq_ldiv4, + vlynq_ldiv5, + vlynq_ldiv6, + vlynq_ldiv7, + vlynq_ldiv8, + vlynq_rdiv1, + vlynq_rdiv2, + vlynq_rdiv3, + vlynq_rdiv4, + vlynq_rdiv5, + vlynq_rdiv6, + vlynq_rdiv7, + vlynq_rdiv8, + vlynq_div_external +}; + +struct vlynq_device_id { + u32 id; + enum vlynq_divisor divisor; + unsigned long driver_data; +}; + +struct vlynq_regs; +struct vlynq_device { + u32 id, dev_id; + int local_irq; + int remote_irq; + enum vlynq_divisor divisor; + u32 regs_start, regs_end; + u32 mem_start, mem_end; + u32 irq_start, irq_end; + int irq; + int enabled; + struct vlynq_regs *local; + struct vlynq_regs *remote; + struct device dev; +}; + +struct vlynq_driver { + char *name; + struct vlynq_device_id *id_table; + int (*probe)(struct vlynq_device *dev, struct vlynq_device_id *id); + void (*remove)(struct vlynq_device *dev); + struct device_driver driver; +}; + +struct plat_vlynq_ops { + int (*on)(struct vlynq_device *dev); + void (*off)(struct vlynq_device *dev); +}; + +static inline struct vlynq_driver *to_vlynq_driver(struct device_driver *drv) +{ + return container_of(drv, struct vlynq_driver, driver); +} + +static inline struct vlynq_device *to_vlynq_device(struct device *device) +{ + return container_of(device, struct vlynq_device, dev); +} + +extern struct bus_type vlynq_bus_type; + +extern int __vlynq_register_driver(struct vlynq_driver *driver, + struct module *owner); + +static inline int vlynq_register_driver(struct vlynq_driver *driver) +{ + return __vlynq_register_driver(driver, THIS_MODULE); +} + +static inline void *vlynq_get_drvdata(struct vlynq_device *dev) +{ + return dev_get_drvdata(&dev->dev); +} + +static inline void vlynq_set_drvdata(struct vlynq_device *dev, void *data) +{ + dev_set_drvdata(&dev->dev, data); +} + +static inline u32 vlynq_mem_start(struct vlynq_device *dev) +{ + return dev->mem_start; +} + +static inline u32 vlynq_mem_end(struct vlynq_device *dev) +{ + return dev->mem_end; +} + +static inline u32 vlynq_mem_len(struct vlynq_device *dev) +{ + return dev->mem_end - dev->mem_start + 1; +} + +static inline int vlynq_virq_to_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + return irq; +} + +static inline int vlynq_irq_to_virq(struct vlynq_device *dev, int irq) +{ + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return 
-EINVAL; + + return irq - dev->irq_start; +} + +extern void vlynq_unregister_driver(struct vlynq_driver *driver); +extern int vlynq_enable_device(struct vlynq_device *dev); +extern void vlynq_disable_device(struct vlynq_device *dev); +extern int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_local_irq(struct vlynq_device *dev, int virq); +extern int vlynq_set_remote_irq(struct vlynq_device *dev, int virq); + +#endif /* __VLYNQ_H__ */ -- cgit v0.10.2 From 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 Mon Sep 17 00:00:00 2001 From: Tomas Szepe Date: Tue, 16 Jun 2009 15:33:56 -0700 Subject: CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK. This makes it possible to run complete systems out of a CONFIG_BLOCK=n initramfs on current kernels again (this last worked on 2.6.27.*). Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/Kconfig b/fs/Kconfig index 525da2e..4044f16 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,13 @@ config FS_POSIX_ACL bool default n +source "fs/xfs/Kconfig" +source "fs/gfs2/Kconfig" +source "fs/ocfs2/Kconfig" +source "fs/btrfs/Kconfig" + +endif # BLOCK + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -47,13 +54,6 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. -source "fs/xfs/Kconfig" -source "fs/gfs2/Kconfig" -source "fs/ocfs2/Kconfig" -source "fs/btrfs/Kconfig" - -endif # BLOCK - source "fs/notify/Kconfig" source "fs/quota/Kconfig" -- cgit v0.10.2 From 8e8a2dea0ca91fe2cb7de7ea212124cfe8c82c35 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Tue, 16 Jun 2009 15:33:57 -0700 Subject: lib/genalloc.c: remove unmatched write_lock() in gen_pool_destroy There is a call to write_lock() in gen_pool_destroy which is not balanced by any corresponding write_unlock(). This causes problems with preemption because the preemption-disable counter is incremented in the write_lock() call, but never decremented by any call to write_unlock(). Few places call gen_pool_destroy(), and one of them is in non-x86 arch-specific code, which is likely why the imbalance went unnoticed. Signed-off-by: Zygo Blaxell Cc: Jiri Kosina Cc: Steve Wise Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/genalloc.c b/lib/genalloc.c index f6d276d..eed2bdb 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -85,7 +85,6 @@ void gen_pool_destroy(struct gen_pool *pool) int bit, end_bit; - write_lock(&pool->lock); list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); list_del(&chunk->next_chunk); -- cgit v0.10.2 From 290603c1205242691b8a0963f496d0aa80e9ca02 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:33:58 -0700 Subject: scripts/get_maintainer.pl: output first field only in mailing lists and after maintainers. 
Fix mailing lists that are described, but not "(subscriber-only)" Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 60dc0c4..c4b25a7 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.15'; +my $V = '0.16'; use Getopt::Long qw(:config no_auto_abbrev); @@ -169,6 +169,7 @@ foreach my $file (@ARGV) { } my @email_to = (); +my @list_to = (); my @scm = (); my @web = (); my @subsystem = (); @@ -182,7 +183,7 @@ foreach my $file (@files) { my $exclude = 0; foreach my $line (@typevalue) { - if ($line =~ m/^(\C):(.*)/) { + if ($line =~ m/^(\C):\s*(.*)/) { my $type = $1; my $value = $2; if ($type eq 'X') { @@ -196,7 +197,7 @@ foreach my $file (@files) { if (!$exclude) { my $tvi = 0; foreach my $line (@typevalue) { - if ($line =~ m/^(\C):(.*)/) { + if ($line =~ m/^(\C):\s*(.*)/) { my $type = $1; my $value = $2; if ($type eq 'F') { @@ -229,15 +230,18 @@ if ($email_git_penguin_chiefs) { } } -if ($email) { - my $address_cnt = @email_to; - if ($address_cnt == 0 && $email_list) { - push(@email_to, "linux-kernel\@vger.kernel.org"); +if ($email || $email_list) { + my @to = (); + if ($email) { + @to = (@to, @email_to); } - -#Don't sort email address list, but do remove duplicates - @email_to = uniq(@email_to); - output(@email_to); + if ($email_list) { + if (@list_to == 0) { + push(@list_to, "linux-kernel\@vger.kernel.org"); + @to = (@to, @list_to); + } + } + output(uniq(@to)); } if ($scm) { @@ -307,7 +311,7 @@ Output type options: --multiline => print 1 entry per line Default options: - [--email --git --m --l --multiline] + [--email --git --m --n --l --multiline] Other options: --version -> show version @@ -366,26 +370,30 @@ sub add_categories { $index = $index - 1; while ($index >= 0) { my $tv = $typevalue[$index]; - if ($tv =~ m/^(\C):(.*)/) { + if ($tv =~ m/^(\C):\s*(.*)/) { my $ptype = $1; my $pvalue = $2; if ($ptype eq "L") { - my $subscr = $pvalue; - if ($subscr =~ m/\s*\(subscribers-only\)/) { + my $list_address = $pvalue; + my $list_additional = ""; + if ($list_address =~ m/([^\s]+)\s+(.*)$/) { + $list_address = $1; + $list_additional = $2; + } + if ($list_additional =~ m/\(subscribers-only\)/) { if ($email_subscriber_list) { - $subscr =~ s/\s*\(subscribers-only\)//g; - push(@email_to, $subscr); + push(@list_to, $list_address); } } else { if ($email_list) { - push(@email_to, $pvalue); + push(@list_to, $list_address); } } } elsif ($ptype eq "M") { if ($email_maintainer) { if ($index >= 0) { my $tv = $typevalue[$index - 1]; - if ($tv =~ m/^(\C):(.*)/) { + if ($tv =~ m/^(\C):\s*(.*)/) { if ($1 eq "P" && $email_usename) { push(@email_to, format_email($2, $pvalue)); } else { -- cgit v0.10.2 From bdf7c685aa4639c95a752b52fa06741a7e3bb34e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:33:59 -0700 Subject: scripts/get_maintainer.pl: better fix for subscriber-only mailing lists Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index c4b25a7..f302ad3 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -380,7 +380,7 @@ sub add_categories { $list_address = $1; $list_additional = $2; } - if ($list_additional =~ m/\(subscribers-only\)/) { + if ($list_additional =~ m/subscribers-only/) { if ($email_subscriber_list) { push(@list_to, $list_address); } -- cgit v0.10.2 From 
f5f5078db2c61bf42ed20527731c0a23bed86c11 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:00 -0700 Subject: scripts/get_maintainer.pl: improve --git-chief-penguins (Linus Torvalds) filtering Moved linux-kernel@vger.kernel.org to MAINTAINERS lkml will be added to all CC lists via F: pattern match Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index f302ad3..1eb67fc 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -216,15 +216,19 @@ foreach my $file (@files) { } -if ($email_git_penguin_chiefs) { +if ($email) { foreach my $chief (@penguin_chief) { if ($chief =~ m/^(.*):(.*)/) { - my $chief_name = $1; - my $chief_addr = $2; + my $email_address; if ($email_usename) { - push(@email_to, format_email($chief_name, $chief_addr)); + $email_address = format_email($1, $2); } else { - push(@email_to, $chief_addr); + $email_address = $2; + } + if ($email_git_penguin_chiefs) { + push(@email_to, $email_address); + } else { + @email_to = grep(!/${email_address}/, @email_to); } } } @@ -236,10 +240,7 @@ if ($email || $email_list) { @to = (@to, @email_to); } if ($email_list) { - if (@list_to == 0) { - push(@list_to, "linux-kernel\@vger.kernel.org"); @to = (@to, @list_to); - } } output(uniq(@to)); } @@ -314,7 +315,7 @@ Default options: [--email --git --m --n --l --multiline] Other options: - --version -> show version + --version => show version --help => show this help information EOT @@ -423,7 +424,7 @@ sub add_categories { sub which { my ($bin) = @_; - foreach my $path (split /:/, $ENV{PATH}) { + foreach my $path (split(/:/, $ENV{PATH})) { if (-e "$path/$bin") { return "$path/$bin"; } @@ -446,12 +447,8 @@ sub recent_git_signoffs { } $cmd = "git log --since=${email_git_since} -- ${file}"; - $cmd .= " | grep -Pi \"^[-_ a-z]+by:.*\\\@\""; - if (!$email_git_penguin_chiefs) { - $cmd .= " | grep -Pv \"${penguin_chiefs}\""; - } + $cmd .= " | grep -Pi \"^[-_ a-z]+by:.*\\\@.*\$\""; $cmd .= " | cut -f2- -d\":\""; - $cmd .= " | sed -e \"s/^\\s+//g\""; $cmd .= " | sort | uniq -c | sort -rn"; $output = `${cmd}`; -- cgit v0.10.2 From de2fc4922b7db1f5099585f821f854a86b5828eb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:01 -0700 Subject: scripts/get_maintainer.pl: warn on missing git or git repository support older versions of grep (use -E not -P) no need to return data in routine recent_git_signoffs Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 1eb67fc..22c7f4e 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -443,11 +443,21 @@ sub recent_git_signoffs { my @lines = (); if (which("git") eq "") { - die("$P: git not found. 
Add --nogit to options?\n"); + return; + } + if (!(-d ".git")) { + warn("$P: .git repository not found.\n"); + warn("Use a .git repository for better results.\n"); + warn("ie: git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git\n"); + return; } $cmd = "git log --since=${email_git_since} -- ${file}"; - $cmd .= " | grep -Pi \"^[-_ a-z]+by:.*\\\@.*\$\""; + $cmd .= " | grep -Ei \"^[-_ a-z]+by:.*\\\@.*\$\""; + if (!$email_git_penguin_chiefs) { + $cmd .= " | grep -Ev \"${penguin_chiefs}\""; + } $cmd .= " | cut -f2- -d\":\""; $cmd .= " | sort | uniq -c | sort -rn"; @@ -486,7 +496,6 @@ sub recent_git_signoffs { push(@email_to, $line); } } - return $output; } sub uniq { -- cgit v0.10.2 From 1b5e1cf64a7a376417457c7f2b3885decea276e4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:01 -0700 Subject: scripts/get_maintainer.pl: support M: lines with names and multiple entries per M: line Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 22c7f4e..7cf4309 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -55,6 +55,10 @@ foreach my $chief (@penguin_chief) { } my $penguin_chiefs = "\(" . join("|",@penguin_chief_names) . "\)"; +# rfc822 - preloaded methods go here. +my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])"; +my $rfc822_char = '[\\000-\\177]'; + if (!GetOptions( 'email!' => \$email, 'git!' => \$email_git, @@ -392,18 +396,7 @@ sub add_categories { } } elsif ($ptype eq "M") { if ($email_maintainer) { - if ($index >= 0) { - my $tv = $typevalue[$index - 1]; - if ($tv =~ m/^(\C):\s*(.*)/) { - if ($1 eq "P" && $email_usename) { - push(@email_to, format_email($2, $pvalue)); - } else { - push(@email_to, $pvalue); - } - } - } else { - push(@email_to, $pvalue); - } + push_email_addresses($pvalue); } } elsif ($ptype eq "T") { push(@scm, $pvalue); @@ -421,6 +414,36 @@ sub add_categories { } } +sub push_email_address { + my ($email_address) = @_; + + my $email_name = ""; + if ($email_address =~ m/([^<]+)<(.*\@.*)>$/) { + $email_name = $1; + $email_address = $2; + } + + if ($email_usename && $email_name) { + push(@email_to, format_email($email_name, $email_address)); + } else { + push(@email_to, $email_address); + } +} + +sub push_email_addresses { + my ($address) = @_; + + my @address_list = (); + + if (@address_list = rfc822_validlist($address)) { + my $array_count = shift(@address_list); + while (my $entry = shift(@address_list)) { + push_email_address($entry); + } + } + +} + sub which { my ($bin) = @_; @@ -480,10 +503,6 @@ sub recent_git_signoffs { if ($line =~ m/(.+)<(.+)>/) { my $git_name = $1; my $git_addr = $2; - $git_name =~ tr/^\"//; - $git_name =~ tr/^\\s*//; - $git_name =~ tr/\"$//; - $git_name =~ tr/\\s*$//; if ($email_usename) { push(@email_to, format_email($git_name, $git_addr)); } else { @@ -527,3 +546,97 @@ sub output { print("\n"); } } + +my $rfc822re; + +sub make_rfc822re { +# Basic lexical tokens are specials, domain_literal, quoted_string, atom, and +# comment. We must allow for rfc822_lwsp (or comments) after each of these. +# This regexp will only work on addresses which have had comments stripped +# and replaced with rfc822_lwsp. 
+ + my $specials = '()<>@,;:\\\\".\\[\\]'; + my $controls = '\\000-\\037\\177'; + + my $dtext = "[^\\[\\]\\r\\\\]"; + my $domain_literal = "\\[(?:$dtext|\\\\.)*\\]$rfc822_lwsp*"; + + my $quoted_string = "\"(?:[^\\\"\\r\\\\]|\\\\.|$rfc822_lwsp)*\"$rfc822_lwsp*"; + +# Use zero-width assertion to spot the limit of an atom. A simple +# $rfc822_lwsp* causes the regexp engine to hang occasionally. + my $atom = "[^$specials $controls]+(?:$rfc822_lwsp+|\\Z|(?=[\\[\"$specials]))"; + my $word = "(?:$atom|$quoted_string)"; + my $localpart = "$word(?:\\.$rfc822_lwsp*$word)*"; + + my $sub_domain = "(?:$atom|$domain_literal)"; + my $domain = "$sub_domain(?:\\.$rfc822_lwsp*$sub_domain)*"; + + my $addr_spec = "$localpart\@$rfc822_lwsp*$domain"; + + my $phrase = "$word*"; + my $route = "(?:\@$domain(?:,\@$rfc822_lwsp*$domain)*:$rfc822_lwsp*)"; + my $route_addr = "\\<$rfc822_lwsp*$route?$addr_spec\\>$rfc822_lwsp*"; + my $mailbox = "(?:$addr_spec|$phrase$route_addr)"; + + my $group = "$phrase:$rfc822_lwsp*(?:$mailbox(?:,\\s*$mailbox)*)?;\\s*"; + my $address = "(?:$mailbox|$group)"; + + return "$rfc822_lwsp*$address"; +} + +sub rfc822_strip_comments { + my $s = shift; +# Recursively remove comments, and replace with a single space. The simpler +# regexps in the Email Addressing FAQ are imperfect - they will miss escaped +# chars in atoms, for example. + + while ($s =~ s/^((?:[^"\\]|\\.)* + (?:"(?:[^"\\]|\\.)*"(?:[^"\\]|\\.)*)*) + \((?:[^()\\]|\\.)*\)/$1 /osx) {} + return $s; +} + +# valid: returns true if the parameter is an RFC822 valid address +# +sub rfc822_valid ($) { + my $s = rfc822_strip_comments(shift); + + if (!$rfc822re) { + $rfc822re = make_rfc822re(); + } + + return $s =~ m/^$rfc822re$/so && $s =~ m/^$rfc822_char*$/; +} + +# validlist: In scalar context, returns true if the parameter is an RFC822 +# valid list of addresses. +# +# In list context, returns an empty list on failure (an invalid +# address was found); otherwise a list whose first element is the +# number of addresses found and whose remaining elements are the +# addresses. This is needed to disambiguate failure (invalid) +# from success with no addresses found, because an empty string is +# a valid list. + +sub rfc822_validlist ($) { + my $s = rfc822_strip_comments(shift); + + if (!$rfc822re) { + $rfc822re = make_rfc822re(); + } + # * null list items are valid according to the RFC + # * the '1' business is to aid in distinguishing failure from no results + + my @r; + if ($s =~ m/^(?:$rfc822re)?(?:,(?:$rfc822re)?)*$/so && + $s =~ m/^$rfc822_char*$/) { + while($s =~ m/(?:^|,$rfc822_lwsp*)($rfc822re)/gos) { + push @r, $1; + } + return wantarray ? (scalar(@r), @r) : 1; + } + else { + return wantarray ? 
() : 0; + } +} -- cgit v0.10.2 From d789504ab03c27b194170262cb4ffda38905c5c0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:02 -0700 Subject: scripts/get_maintainer.pl: better email name quoting Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 7cf4309..159ce64 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -356,6 +356,7 @@ sub format_email { my ($name, $email) = @_; $name =~ s/^\s+|\s+$//g; + $name =~ s/^\"|\"$//g; $email =~ s/^\s+|\s+$//g; my $formatted_email = ""; -- cgit v0.10.2 From 5f2441e97684cfc787873f884c715e109ffcfbcd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:02 -0700 Subject: scripts/get_maintainer.pl: support both "P:/M:" and integrated "M:" lines Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 159ce64..a1a43cf 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -55,7 +55,7 @@ foreach my $chief (@penguin_chief) { } my $penguin_chiefs = "\(" . join("|",@penguin_chief_names) . "\)"; -# rfc822 - preloaded methods go here. +# rfc822 email address - preloaded methods go here. my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])"; my $rfc822_char = '[\\000-\\177]'; @@ -396,7 +396,19 @@ sub add_categories { } } } elsif ($ptype eq "M") { - if ($email_maintainer) { + my $p_used = 0; + if ($index >= 0) { + my $tv = $typevalue[$index - 1]; + if ($tv =~ m/^(\C):\s*(.*)/) { + if ($1 eq "P") { + if ($email_usename) { + push_email_address(format_email($2, $pvalue)); + $p_used = 1; + } + } + } + } + if (!$p_used) { push_email_addresses($pvalue); } } elsif ($ptype eq "T") { @@ -436,13 +448,16 @@ sub push_email_addresses { my @address_list = (); - if (@address_list = rfc822_validlist($address)) { + if (rfc822_valid($address)) { + push_email_address($address); + } elsif (@address_list = rfc822_validlist($address)) { my $array_count = shift(@address_list); while (my $entry = shift(@address_list)) { push_email_address($entry); } + } else { + warn("Invalid MAINTAINERS address: '" . $address . "'\n"); } - } sub which { @@ -471,9 +486,8 @@ sub recent_git_signoffs { return; } if (!(-d ".git")) { - warn("$P: .git repository not found.\n"); - warn("Use a .git repository for better results.\n"); - warn("ie: git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git\n"); + warn("$P: .git directory not found. Use a git repository for better results.\n"); + warn("$P: perhaps 'git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git'\n"); return; } @@ -632,7 +646,7 @@ sub rfc822_validlist ($) { my @r; if ($s =~ m/^(?:$rfc822re)?(?:,(?:$rfc822re)?)*$/so && $s =~ m/^$rfc822_char*$/) { - while($s =~ m/(?:^|,$rfc822_lwsp*)($rfc822re)/gos) { + while ($s =~ m/(?:^|,$rfc822_lwsp*)($rfc822re)/gos) { push @r, $1; } return wantarray ? (scalar(@r), @r) : 1; -- cgit v0.10.2 From 0a79c492bcb1022e9a2d0bcb5ed6c624ef6641a0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:03 -0700 Subject: scripts/get_maintainer.pl: don't print maintainers when not requested Fixed bug introduced after using rfc822 address checking. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index a1a43cf..e57c3f6 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -436,10 +436,12 @@ sub push_email_address { $email_address = $2; } - if ($email_usename && $email_name) { - push(@email_to, format_email($email_name, $email_address)); - } else { - push(@email_to, $email_address); + if ($email_maintainer) { + if ($email_usename && $email_name) { + push(@email_to, format_email($email_name, $email_address)); + } else { + push(@email_to, $email_address); + } } } -- cgit v0.10.2 From df4cc036828f6027689016a91adadee405eab104 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:04 -0700 Subject: scripts/get_maintainer.pl: allow 8 bit characters in email addresses Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index e57c3f6..19854f5 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -57,7 +57,7 @@ my $penguin_chiefs = "\(" . join("|",@penguin_chief_names) . "\)"; # rfc822 email address - preloaded methods go here. my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])"; -my $rfc822_char = '[\\000-\\177]'; +my $rfc822_char = '[\\000-\\377]'; if (!GetOptions( 'email!' => \$email, -- cgit v0.10.2 From 7f29fd2748ac8a8a47c949b26e5a9749b1b804fb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:04 -0700 Subject: scripts/get_maintainer.pl: change "die" to "warn" when command line file is not a patch fixes git send-email with a cover letter Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 19854f5..3e73314 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -165,7 +165,7 @@ foreach my $file (@ARGV) { } close(PATCH); if ($file_cnt == @files) { - die "$P: file '${file}' doesn't appear to be a patch. " + warn "$P: file '${file}' doesn't appear to be a patch. " . "Add -f to options?\n"; } @files = sort_and_uniq(@files); -- cgit v0.10.2 From 866a36b73ecfaffbb4ba74f7fb3a18d71a982983 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:05 -0700 Subject: MAINTAINERS: swap mismarked ECRYPT FS M: and P: entries Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index b735f7d..143bdbd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1981,8 +1981,8 @@ F: net/bridge/netfilter/ebt*.c ECRYPT FILE SYSTEM P: Tyler Hicks M: tyhicks@linux.vnet.ibm.com -M: Dustin Kirkland -P: kirkland@canonical.com +P: Dustin Kirkland +M: kirkland@canonical.com L: ecryptfs-devel@lists.launchpad.net W: https://launchpad.net/ecryptfs S: Supported -- cgit v0.10.2 From 34d03cc16adc475118f7cfbb2288fca5af1e252f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:06 -0700 Subject: MAINTAINERS: add file patterns to "THE REST" These file patterns match all sources. By default, scripts/get_maintainers.pl excludes Linus Torvalds from the CC: list. Option --git-chief-penguins will include him. 
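The catch-all effect of "F: *" plus "F: */" follows from the script treating F: values as globs anchored at the start of a path. The helper below is a rough sketch of that matching with a hypothetical name, not the script's literal code:

    sub matches_file_pattern {
        my ($file, $pattern) = @_;
        my $regex = quotemeta($pattern);   # escape regex metacharacters
        $regex =~ s/\\\*/.*/g;             # then map the glob '*' to '.*'
        return $file =~ m/^${regex}/;
    }

    matches_file_pattern("lib/rbtree.c", "*");    # true for any path
    matches_file_pattern("lib/rbtree.c", "*/");   # true for any path inside a directory

With those two patterns, every file matches THE REST, so lkml ends up CC'ed on everything — which is what lets a later patch in this series drop the explicit lkml L: lines from individual sections.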
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 143bdbd..b0b0e2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6530,5 +6530,10 @@ F: drivers/serial/zs.* THE REST P: Linus Torvalds +M: torvalds@linux-foundation.org +L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git S: Buried alive in reporters +F: * +F: */ + -- cgit v0.10.2 From 3f563d2503604b1d33d2474d99a32a6fb4cc7904 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:06 -0700 Subject: MAINTAINERS: update M32R file patterns after rename Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index b0b0e2b..8c47867 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3658,7 +3658,6 @@ L: linux-m32r-ja@ml.linux-m32r.org (in Japanese) W: http://www.linux-m32r.org/ S: Maintained F: arch/m32r/ -F: include/asm-m32r/ M68K ARCHITECTURE P: Geert Uytterhoeven -- cgit v0.10.2 From 93711660086d0ad2da071b6aba675e96277e995f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:07 -0700 Subject: MAINTAINERS: mark ALSA lists as moderated Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 8c47867..644102a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -488,7 +488,7 @@ AOA (Apple Onboard Audio) ALSA DRIVER P: Johannes Berg M: johannes@sipsolutions.net L: linuxppc-dev@ozlabs.org -L: alsa-devel@alsa-project.org (subscribers-only) +L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Maintained F: sound/aoa/ @@ -1436,7 +1436,7 @@ F: drivers/usb/host/ohci-ep93xx.c CIRRUS LOGIC CS4270 SOUND DRIVER P: Timur Tabi M: timur@freescale.com -L: alsa-devel@alsa-project.org +L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported F: sound/soc/codecs/cs4270* @@ -2365,7 +2365,7 @@ F: drivers/serial/ucc_uart.c FREESCALE SOC SOUND DRIVERS P: Timur Tabi M: timur@freescale.com -L: alsa-devel@alsa-project.org +L: alsa-devel@alsa-project.org (moderated for non-subscribers) L: linuxppc-dev@ozlabs.org S: Supported F: sound/soc/fsl/fsl* @@ -5425,7 +5425,7 @@ P: Jaroslav Kysela M: perex@perex.cz P: Takashi Iwai M: tiwai@suse.de -L: alsa-devel@alsa-project.org (subscribers-only) +L: alsa-devel@alsa-project.org (moderated for non-subscribers) W: http://www.alsa-project.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound-2.6.git T: git git://git.alsa-project.org/alsa-kernel.git @@ -5440,7 +5440,7 @@ M: lrg@slimlogic.co.uk P: Mark Brown M: broonie@opensource.wolfsonmicro.com T: git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound-2.6.git -L: alsa-devel@alsa-project.org (subscribers-only) +L: alsa-devel@alsa-project.org (moderated for non-subscribers) W: http://alsa-project.org/main/index.php/ASoC S: Supported F: sound/soc/ -- cgit v0.10.2 From b5472cddbe2c41fd434592ecf3c5b81a551d5bea Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:07 -0700 Subject: MAINTAINERS: remove L: linux-kernel@vger.kernel.org from all but "THE REST" lkml is added to all CC lists via pattern matching on "THE REST" Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 644102a..bfbd5be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -911,7 +911,6 @@ P: Dan Williams M: dan.j.williams@intel.com P: Maciej Sosnowski M: maciej.sosnowski@intel.com 
-L: linux-kernel@vger.kernel.org W: http://sourceforge.net/projects/xscaleiop S: Supported F: Documentation/crypto/async-tx-api.txt @@ -1007,7 +1006,6 @@ F: drivers/mmc/host/at91_mci.c ATMEL AT91 / AT32 SERIAL DRIVER P: Haavard Skinnemoen M: hskinnemoen@atmel.com -L: linux-kernel@vger.kernel.org S: Supported F: drivers/serial/atmel_serial.c @@ -1063,7 +1061,6 @@ F: kernel/audit* AUXILIARY DISPLAY DRIVERS P: Miguel Ojeda Sandonis M: miguel.ojeda.sandonis@gmail.com -L: linux-kernel@vger.kernel.org W: http://miguelojeda.es/auxdisplay.htm W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained @@ -1133,7 +1130,6 @@ F: drivers/net/hamradio/baycom* BEFS FILE SYSTEM P: Sergey S. Kostyliov M: rathamahata@php4.ru -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/befs.txt F: fs/befs/ @@ -1141,7 +1137,6 @@ F: fs/befs/ BFS FILE SYSTEM P: Tigran A. Aivazian M: tigran@aivazian.fsnet.co.uk -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/bfs.txt F: fs/bfs/ @@ -1198,7 +1193,6 @@ F: drivers/i2c/busses/i2c-bfin-twi.c BLOCK LAYER P: Jens Axboe M: axboe@kernel.dk -L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git S: Maintained F: block/ @@ -1325,7 +1319,6 @@ P: Muli Ben-Yehuda M: muli@il.ibm.com P: Jon D. Mason M: jdmason@kudzu.us -L: linux-kernel@vger.kernel.org L: discuss@x86-64.org S: Maintained F: arch/x86/kernel/pci-calgary_64.c @@ -1377,7 +1370,6 @@ F: include/linux/usb/wusb* CFAG12864B LCD DRIVER P: Miguel Ojeda Sandonis M: miguel.ojeda.sandonis@gmail.com -L: linux-kernel@vger.kernel.org W: http://miguelojeda.es/auxdisplay.htm W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained @@ -1387,7 +1379,6 @@ F: include/linux/cfag12864b.h CFAG12864BFB LCD FRAMEBUFFER DRIVER P: Miguel Ojeda Sandonis M: miguel.ojeda.sandonis@gmail.com -L: linux-kernel@vger.kernel.org W: http://miguelojeda.es/auxdisplay.htm W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained @@ -1407,7 +1398,6 @@ X: net/wireless/wext* CHECKPATCH P: Andy Whitcroft M: apw@canonical.com -L: linux-kernel@vger.kernel.org S: Supported F: scripts/checkpatch.pl @@ -1533,7 +1523,6 @@ F: drivers/usb/atm/cxacru.c CONFIGFS P: Joel Becker M: joel.becker@oracle.com -L: linux-kernel@vger.kernel.org S: Supported F: fs/configfs/ F: include/linux/configfs.h @@ -1591,7 +1580,6 @@ F: arch/x86/kernel/msr.c CPUSETS P: Paul Menage M: menage@google.com -L: linux-kernel@vger.kernel.org W: http://www.bullopensource.org/cpuset/ W: http://oss.sgi.com/projects/cpusets/ S: Supported @@ -1798,7 +1786,6 @@ DEVICE NUMBER REGISTRY P: Torben Mathiasen M: device@lanana.org W: http://lanana.org/docs/device-list/index.html -L: linux-kernel@vger.kernel.org S: Maintained DEVICE-MAPPER (LVM) @@ -1824,7 +1811,6 @@ F: drivers/char/digi* DIRECTORY NOTIFICATION (DNOTIFY) P: Eric Paris M: eparis@parisplace.org -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/dnotify.txt F: fs/notify/dnotify/ @@ -1841,7 +1827,6 @@ S: Maintained DISKQUOTA P: Jan Kara M: jack@suse.cz -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/quota.txt F: fs/quota/ @@ -1863,7 +1848,6 @@ P: Maciej Sosnowski M: maciej.sosnowski@intel.com P: Dan Williams M: dan.j.williams@intel.com -L: linux-kernel@vger.kernel.org S: Supported F: drivers/dma/ F: include/linux/dma* @@ -1915,7 +1899,6 @@ F: drivers/scsi/dpt/ DRIVER CORE, KOBJECTS, AND SYSFS P: Greg Kroah-Hartman M: gregkh@suse.de -L: linux-kernel@vger.kernel.org T: quilt 
kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ S: Supported F: Documentation/kobject.txt @@ -2262,7 +2245,6 @@ F: drivers/firewire/ F: include/linux/firewire*.h FIRMWARE LOADER (request_firmware) -L: linux-kernel@vger.kernel.org S: Orphan F: Documentation/firmware_class/ F: drivers/base/firmware*.c @@ -2299,7 +2281,6 @@ M: leoli@freescale.com P: Zhang Wei M: zw@zh-kernel.org L: linuxppc-dev@ozlabs.org -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/dma/fsldma.* @@ -2499,7 +2480,6 @@ F: drivers/hwmon/hdaps.c HYPERVISOR VIRTUAL CONSOLE DRIVER L: linuxppc-dev@ozlabs.org -L: linux-kernel@vger.kernel.org S: Odd Fixes F: drivers/char/hvc_* @@ -2566,7 +2546,6 @@ F: sound/parisc/harmony.* HAYES ESP SERIAL DRIVER P: Andrew J. Robinson M: arobinso@nyx.net -L: linux-kernel@vger.kernel.org W: http://www.nyx.net/~arobinso S: Maintained F: Documentation/serial/hayes-esp.txt @@ -2592,7 +2571,6 @@ F: include/linux/cciss_ioctl.h HFS FILESYSTEM P: Roman Zippel M: zippel@linux-m68k.org -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/hfs.txt F: fs/hfs/ @@ -2632,7 +2610,6 @@ F: include/linux/hid* HIGH-RESOLUTION TIMERS, CLOCKEVENTS, DYNTICKS P: Thomas Gleixner M: tglx@linutronix.de -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/timers/ F: kernel/hrtimer.c @@ -2771,7 +2748,6 @@ F: drivers/i2c/busses/i2c-tiny-usb.c i386 BOOT CODE P: H. Peter Anvin M: hpa@zytor.com -L: Linux-Kernel@vger.kernel.org S: Maintained F: arch/x86/boot/ @@ -2901,7 +2877,6 @@ P: Robert Love M: rlove@rlove.org P: Eric Paris M: eparis@parisplace.org -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/inotify.txt F: fs/notify/inotify/ @@ -2949,7 +2924,6 @@ F: arch/x86/kernel/microcode_intel.c INTEL I/OAT DMA DRIVER P: Maciej Sosnowski M: maciej.sosnowski@intel.com -L: linux-kernel@vger.kernel.org S: Supported F: drivers/dma/ioat* @@ -2965,7 +2939,6 @@ F: include/linux/intel-iommu.h INTEL IOP-ADMA DMA DRIVER P: Dan Williams M: dan.j.williams@intel.com -L: linux-kernel@vger.kernel.org S: Supported F: drivers/dma/iop-adma.c @@ -3278,7 +3251,6 @@ M: vgoyal@redhat.com P: Haren Myneni M: hbabu@us.ibm.com L: kexec@lists.infradead.org -L: linux-kernel@vger.kernel.org W: http://lse.sourceforge.net/kdump/ S: Maintained F: Documentation/kdump/ @@ -3388,7 +3360,6 @@ KEXEC P: Eric Biederman M: ebiederm@xmission.com W: http://ftp.kernel.org/pub/linux/kernel/people/horms/kexec-tools/ -L: linux-kernel@vger.kernel.org L: kexec@lists.infradead.org S: Maintained F: include/linux/kexec.h @@ -3418,7 +3389,6 @@ F: mm/kmemleak-test.c KMEMTRACE P: Eduard - Gabriel Munteanu M: eduard.munteanu@linux360.ro -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/trace/kmemtrace.txt F: include/trace/kmemtrace.h @@ -3433,7 +3403,6 @@ P: David S. 
Miller M: davem@davemloft.net P: Masami Hiramatsu M: mhiramat@redhat.com -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/kprobes.txt F: include/linux/kprobes.h @@ -3442,7 +3411,6 @@ F: kernel/kprobes.c KS0108 LCD CONTROLLER DRIVER P: Miguel Ojeda Sandonis M: miguel.ojeda.sandonis@gmail.com -L: linux-kernel@vger.kernel.org W: http://miguelojeda.es/auxdisplay.htm W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained @@ -3606,7 +3574,6 @@ P: Peter Zijlstra M: peterz@infradead.org P: Ingo Molnar M: mingo@redhat.com -L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/peterz/linux-2.6-lockdep.git S: Maintained F: Documentation/lockdep*.txt @@ -3741,7 +3708,6 @@ F: include/linux/mv643xx.h MARVELL SOC MMC/SD/SDIO CONTROLLER DRIVER P: Nicolas Pitre M: nico@cam.org -L: linux-kernel@vger.kernel.org S: Maintained MARVELL YUKON / SYSKONNECT DRIVER @@ -3795,7 +3761,6 @@ F: drivers/scsi/megaraid/ MEMORY MANAGEMENT L: linux-mm@kvack.org -L: linux-kernel@vger.kernel.org W: http://www.linux-mm.org S: Maintained F: include/linux/mm.h @@ -3809,7 +3774,6 @@ M: xemul@openvz.org P: KAMEZAWA Hiroyuki M: kamezawa.hiroyu@jp.fujitsu.com L: linux-mm@kvack.org -L: linux-kernel@vger.kernel.org S: Maintained F: mm/memcontrol.c @@ -3852,7 +3816,6 @@ F: arch/mips/ MISCELLANEOUS MCA-SUPPORT P: James Bottomley M: James.Bottomley@HansenPartnership.com -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/ia64/mca.txt F: Documentation/mca.txt @@ -3862,7 +3825,6 @@ F: include/linux/mca* MODULE SUPPORT P: Rusty Russell M: rusty@rustcorp.com.au -L: linux-kernel@vger.kernel.org S: Maintained F: include/linux/module.h F: kernel/module.c @@ -3886,7 +3848,6 @@ F: drivers/mmc/host/imxmmc.* MOUSE AND MISC DEVICES [GENERAL] P: Alessandro Rubini M: rubini@ipvvis.unipv.it -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/input/mouse/ F: include/linux/gpio_mouse.h @@ -3894,7 +3855,6 @@ F: include/linux/gpio_mouse.h MOXA SMARTIO/INDUSTIO/INTELLIO SERIAL CARD P: Jiri Slaby M: jirislaby@gmail.com -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/serial/moxa-smartio F: drivers/char/mxser.* @@ -3910,7 +3870,6 @@ F: drivers/platform/x86/msi-laptop.c MULTIFUNCTION DEVICES (MFD) P: Samuel Ortiz M: sameo@linux.intel.com -L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/sameo/mfd-2.6.git S: Supported F: drivers/mfd/ @@ -3918,7 +3877,6 @@ F: drivers/mfd/ MULTIMEDIA CARD (MMC), SECURE DIGITAL (SD) AND SDIO SUBSYSTEM P: Pierre Ossman M: pierre@ossman.eu -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/mmc/ F: include/linux/mmc/ @@ -3926,7 +3884,6 @@ F: include/linux/mmc/ MULTIMEDIA CARD (MMC) ETC. 
OVER SPI P: David Brownell M: dbrownell@users.sourceforge.net -L: linux-kernel@vger.kernel.org S: Odd Fixes F: drivers/mmc/host/mmc_spi.c F: include/linux/spi/mmc_spi.h @@ -3941,7 +3898,6 @@ F: sound/oss/msnd* MULTITECH MULTIPORT CARD (ISICOM) P: Jiri Slaby M: jirislaby@gmail.com -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/char/isicom.c F: include/linux/isicom.h @@ -4185,7 +4141,6 @@ NTFS FILESYSTEM P: Anton Altaparmakov M: aia21@cantab.net L: linux-ntfs-dev@lists.sourceforge.net -L: linux-kernel@vger.kernel.org W: http://www.linux-ntfs.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/aia21/ntfs-2.6.git S: Maintained @@ -4419,7 +4374,6 @@ M: akataria@vmware.com P: Rusty Russell M: rusty@rustcorp.com.au L: virtualization@lists.osdl.org -L: linux-kernel@vger.kernel.org S: Supported F: Documentation/ia64/paravirt_ops.txt F: arch/*/kernel/paravirt* @@ -4470,7 +4424,6 @@ F: include/linux/leds-pca9532.h PCI ERROR RECOVERY P: Linas Vepstas M: linas@austin.ibm.com -L: linux-kernel@vger.kernel.org L: linux-pci@vger.kernel.org S: Supported F: Documentation/PCI/pci-error-recovery.txt @@ -4479,7 +4432,6 @@ F: Documentation/powerpc/eeh-pci-error-recovery.txt PCI SUBSYSTEM P: Jesse Barnes M: jbarnes@virtuousgeek.org -L: linux-kernel@vger.kernel.org L: linux-pci@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/jbarnes/pci-2.6.git S: Supported @@ -4514,7 +4466,6 @@ F: drivers/net/pcnet32.c PER-TASK DELAY ACCOUNTING P: Balbir Singh M: balbir@linux.vnet.ibm.com -L: linux-kernel@vger.kernel.org S: Maintained F: include/linux/delayacct.h F: kernel/delayacct.c @@ -4546,7 +4497,6 @@ F: drivers/mtd/devices/phram.c PKTCDVD DRIVER P: Peter Osterlund M: petero2@telia.com -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/block/pktcdvd.c F: include/linux/pktcdvd.h @@ -4554,7 +4504,6 @@ F: include/linux/pktcdvd.h POSIX CLOCKS and TIMERS P: Thomas Gleixner M: tglx@linutronix.de -L: linux-kernel@vger.kernel.org S: Supported F: fs/timerfd.c F: include/linux/timer* @@ -4565,7 +4514,6 @@ P: Anton Vorontsov M: cbou@mail.ru P: David Woodhouse M: dwmw2@infradead.org -L: linux-kernel@vger.kernel.org T: git git://git.infradead.org/battery-2.6.git S: Maintained F: include/linux/power_supply.h @@ -4617,7 +4565,6 @@ F: include/linux/if_pppol2tp.h PREEMPTIBLE KERNEL P: Robert Love M: rml@tech9.net -L: linux-kernel@vger.kernel.org L: kpreempt-tech@lists.sourceforge.net W: ftp://ftp.kernel.org/pub/linux/kernel/people/rml/preempt-kernel S: Supported @@ -4680,7 +4627,6 @@ P: Roland McGrath M: roland@redhat.com P: Oleg Nesterov M: oleg@redhat.com -L: linux-kernel@vger.kernel.org S: Maintained F: include/asm-generic/syscall.h F: include/linux/ptrace.h @@ -4766,7 +4712,6 @@ F: drivers/net/qlge/ QNX4 FILESYSTEM P: Anders Larsen M: al@alarsen.net -L: linux-kernel@vger.kernel.org W: http://www.alarsen.net/linux/qnx4fs/ S: Maintained F: fs/qnx4/ @@ -4813,7 +4758,6 @@ F: drivers/char/random.c RAPIDIO SUBSYSTEM P: Matt Porter M: mporter@kernel.crashing.org -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/rapidio/ @@ -4827,7 +4771,6 @@ F: drivers/net/wireless/ray* RCUTORTURE MODULE P: Josh Triplett M: josh@freedesktop.org -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/RCU/torture.txt F: kernel/rcutorture.c @@ -4835,7 +4778,6 @@ F: kernel/rcutorture.c RDC R-321X SoC P: Florian Fainelli M: florian@openwrt.org -L: linux-kernel@vger.kernel.org S: Maintained RDC R6040 FAST ETHERNET DRIVER @@ -4856,7 +4798,6 @@ READ-COPY UPDATE (RCU) P: Dipankar Sarma M: dipankar@in.ibm.com 
W: http://www.rdrop.com/users/paulmck/rclock/ -L: linux-kernel@vger.kernel.org S: Supported F: Documentation/RCU/rcu.txt F: Documentation/RCU/rcuref.txt @@ -4867,7 +4808,6 @@ F: kernel/rcupdate.c REAL TIME CLOCK DRIVER P: Paul Gortmaker M: p_gortmaker@yahoo.com -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/rtc.txt F: drivers/rtc/ @@ -5005,7 +4945,6 @@ S3C24XX SD/MMC Driver P: Ben Dooks M: ben-linux@fluff.org L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) -L: linux-kernel@vger.kernel.org S: Supported F: drivers/mmc/host/s3cmci.* @@ -5031,7 +4970,6 @@ P: Ingo Molnar M: mingo@elte.hu P: Peter Zijlstra M: peterz@infradead.org -L: linux-kernel@vger.kernel.org S: Maintained F: kernel/sched* F: include/linux/sched.h @@ -5133,7 +5071,6 @@ F: drivers/mmc/host/sdhci.* SECURITY SUBSYSTEM P: James Morris M: jmorris@namei.org -L: linux-kernel@vger.kernel.org L: linux-security-module@vger.kernel.org (suggested Cc:) T: git git://www.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git W: http://security.wiki.kernel.org/ @@ -5152,7 +5089,6 @@ P: James Morris M: jmorris@namei.org P: Eric Paris M: eparis@parisplace.org -L: linux-kernel@vger.kernel.org (kernel issues) L: selinux@tycho.nsa.gov (subscribers-only, general discussion) W: http://selinuxproject.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git @@ -5415,7 +5351,6 @@ F: include/linux/sony-laptop.h SONY MEMORYSTICK CARD SUPPORT P: Alex Dubov M: oakad@yahoo.com -L: linux-kernel@vger.kernel.org W: http://tifmxx.berlios.de/ S: Maintained F: drivers/memstick/host/tifm_ms.c @@ -5458,7 +5393,6 @@ F: arch/sparc/ SPECIALIX IO8+ MULTIPORT SERIAL CARD DRIVER P: Roger Wolff M: R.E.Wolff@BitWizard.nl -L: linux-kernel@vger.kernel.org S: Supported F: Documentation/serial/specialix.txt F: drivers/char/specialix* @@ -5504,7 +5438,6 @@ F: fs/squashfs/ SRM (Alpha) environment access P: Jan-Benedict Glaw M: jbglaw@lug-owl.de -L: linux-kernel@vger.kernel.org S: Maintained F: arch/alpha/kernel/srm_env.c @@ -5519,7 +5452,6 @@ S: Maintained STAGING SUBSYSTEM P: Greg Kroah-Hartman M: gregkh@suse.de -L: linux-kernel@vger.kernel.org T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ S: Maintained F: drivers/staging/ @@ -5599,7 +5531,6 @@ F: include/linux/sysv_fs.h TASKSTATS STATISTICS INTERFACE P: Balbir Singh M: balbir@linux.vnet.ibm.com -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/accounting/taskstats* F: include/linux/taskstats* @@ -5692,7 +5623,6 @@ P: Kentaro Takeda M: takedakn@nttdata.co.jp P: Tetsuo Handa M: penguin-kernel@I-love.SAKURA.ne.jp -L: linux-kernel@vger.kernel.org (kernel issues) L: tomoyo-users-en@lists.sourceforge.jp (subscribers-only, for developers and users in English) L: tomoyo-dev@lists.sourceforge.jp (subscribers-only, for developers in Japanese) L: tomoyo-users@lists.sourceforge.jp (subscribers-only, for users in Japanese) @@ -5744,14 +5674,12 @@ F: drivers/char/tpm/ TRIVIAL PATCHES P: Jiri Kosina M: trivial@kernel.org -L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial.git S: Maintained TTY LAYER P: Alan Cox M: alan@lxorguk.ukuu.org.uk -L: linux-kernel@vger.kernel.org S: Maintained T: stgit http://zeniv.linux.org.uk/~alan/ttydev/ @@ -5824,7 +5752,6 @@ F: fs/udf/ UFS FILESYSTEM P: Evgeniy Dushistov M: dushistov@mail.ru -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/ufs.txt F: fs/ufs/ @@ -5841,7 +5768,6 @@ F: include/linux/uwb/ UNIFORM CDROM DRIVER P: Jens 
Axboe M: axboe@kernel.dk -L: linux-kernel@vger.kernel.org W: http://www.kernel.dk S: Maintained F: Documentation/cdrom/ @@ -5870,7 +5796,6 @@ F: drivers/usb/class/cdc-acm.* USB BLOCK DRIVER (UB ub) P: Pete Zaitcev M: zaitcev@redhat.com -L: linux-kernel@vger.kernel.org L: linux-usb@vger.kernel.org S: Supported F: drivers/block/ub.c @@ -6210,7 +6135,6 @@ P: Hans J. Koch M: hjk@linutronix.de P: Greg Kroah-Hartman M: gregkh@suse.de -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/DocBook/uio-howto.tmpl F: drivers/uio/ @@ -6236,7 +6160,6 @@ F: drivers/video/uvesafb.* VFAT/FAT/MSDOS FILESYSTEM P: OGAWA Hirofumi M: hirofumi@mail.parknet.co.jp -L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/vfat.txt F: fs/fat/ @@ -6341,7 +6264,6 @@ F: drivers/hwmon/w83793.c W83L51xD SD/MMC CARD INTERFACE DRIVER P: Pierre Ossman M: pierre@ossman.eu -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/mmc/host/wbsd.* @@ -6428,7 +6350,6 @@ M: mingo@redhat.com P: H. Peter Anvin M: hpa@zytor.com M: x86@kernel.org -L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git S: Maintained F: Documentation/x86/ @@ -6464,7 +6385,6 @@ XILINX SYSTEMACE DRIVER P: Grant Likely M: grant.likely@secretlab.ca W: http://www.secretlab.ca/ -L: linux-kernel@vger.kernel.org S: Maintained F: drivers/block/xsysace.c @@ -6535,4 +6455,3 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git S: Buried alive in reporters F: * F: */ - -- cgit v0.10.2 From f70f873b8ffa0bca4aeccd20a3e4641b7645018a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:08 -0700 Subject: MAINTAINERS: mention scripts/get_maintainer.pl in the preface Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index bfbd5be..053bf8f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -36,6 +36,12 @@ trivial patch so apply some common sense. (scripts/checkpatch.pl) to catch trival style violations. See Documentation/CodingStyle for guidance here. + PLEASE CC: the maintainers and mailing lists that are generated + by scripts/get_maintainer.pl. The results returned by the + script will be best if you have git installed and are making + your changes in a branch derived from Linus' latest git tree. + See Documentation/SubmittingPatches for details. + PLEASE try to include any credit lines you want added with the patch. It avoids people being missed off by mistake and makes it easier to know who wants adding and who doesn't. -- cgit v0.10.2 From 2a99921a5570381fc1da04a1bd9f7afe8692e49e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:09 -0700 Subject: MAINTAINERS: add file pattern to CISCO FCOE HBA DRIVER Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 053bf8f..99371ee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1457,6 +1457,7 @@ P: Joe Eykholt M: jeykholt@cisco.com L: linux-scsi@vger.kernel.org S: Supported +F: drivers/scsi/fnic/ CODA FILE SYSTEM P: Jan Harkes -- cgit v0.10.2 From 852f9bbbdec485d3015adc81b7b0c874c55705cc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:09 -0700 Subject: MAINTAINERS: add Paul McKenney to RCU and RCUTORTURE Signed-off-by: Joe Perches Acked-by: Paul E. 
McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 99371ee..e31f0ee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4778,6 +4778,8 @@ F: drivers/net/wireless/ray* RCUTORTURE MODULE P: Josh Triplett M: josh@freedesktop.org +P: Paul E. McKenney +M: paulmck@linux.vnet.ibm.com S: Maintained F: Documentation/RCU/torture.txt F: kernel/rcutorture.c @@ -4804,6 +4806,8 @@ F: net/rds/ READ-COPY UPDATE (RCU) P: Dipankar Sarma M: dipankar@in.ibm.com +P: Paul E. McKenney +M: paulmck@linux.vnet.ibm.com W: http://www.rdrop.com/users/paulmck/rclock/ S: Supported F: Documentation/RCU/rcu.txt -- cgit v0.10.2 From 130b76cf5295d74a2174d0843bb563bbf0a19dbb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 16 Jun 2009 15:34:10 -0700 Subject: MAINTAINERS: add file patterns to TTY LAYER Signed-off-by: Joe Perches Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index e31f0ee..06579ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5687,6 +5687,11 @@ P: Jiri Kosina M: trivial@kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial.git S: Maintained +F: drivers/char/tty_* +F: drivers/serial/serial_core.c +F: include/linux/serial_core.h +F: include/linux/serial.h +F: include/linux/tty.h TTY LAYER P: Alan Cox -- cgit v0.10.2 From 16c047add3ceaf0ab882e3e094d1ec904d02312d Mon Sep 17 00:00:00 2001 From: Wolfram Strepp Date: Tue, 16 Jun 2009 15:34:11 -0700 Subject: rb_tree: reorganize code in rb_erase() for additional changes First, move some code around in order to make the next change more obvious. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Peter Zijlstra Signed-off-by: Wolfram Strepp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/rbtree.c b/lib/rbtree.c index f653659..0455685 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -231,6 +231,15 @@ void rb_erase(struct rb_node *node, struct rb_root *root) node = node->rb_right; while ((left = node->rb_left) != NULL) node = left; + + if (rb_parent(old)) { + if (rb_parent(old)->rb_left == old) + rb_parent(old)->rb_left = node; + else + rb_parent(old)->rb_right = node; + } else + root->rb_node = node; + child = node->rb_right; parent = rb_parent(node); color = rb_color(node); @@ -247,15 +256,6 @@ void rb_erase(struct rb_node *node, struct rb_root *root) node->rb_right = old->rb_right; node->rb_left = old->rb_left; - if (rb_parent(old)) - { - if (rb_parent(old)->rb_left == old) - rb_parent(old)->rb_left = node; - else - rb_parent(old)->rb_right = node; - } else - root->rb_node = node; - rb_set_parent(old->rb_left, node); if (old->rb_right) rb_set_parent(old->rb_right, node); -- cgit v0.10.2 From 4c60117811171d867d4f27f17ea07d7419d45dae Mon Sep 17 00:00:00 2001 From: Wolfram Strepp Date: Tue, 16 Jun 2009 15:34:12 -0700 Subject: rb_tree: make clear distinction between two different cases in rb_erase() There are two cases when a node, having 2 childs, is erased: 'normal case': the successor is not the right-hand-child of the node to be erased 'special case': the successor is the right-hand child of the node to be erased Here some ascii-art, with following symbols (referring to the code): O: node to be deleted N: the successor of O P: parent of N C: child of N L: some other node normal case: O N / \ / \ / \ / \ L \ L \ / \ P ----> / \ P / \ / \ / / N C \ / \ \ C / \ special case: O|P N / \ / \ / \ / \ L \ L \ / \ N ----> / C \ / \ \ C / \ Notice that for the special case we don't have to 
reconnect C to N. Signed-off-by: Wolfram Strepp Signed-off-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/rbtree.c b/lib/rbtree.c index 0455685..4b4b29b 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -244,13 +244,13 @@ void rb_erase(struct rb_node *node, struct rb_root *root) parent = rb_parent(node); color = rb_color(node); - if (child) - rb_set_parent(child, parent); if (parent == old) { - parent->rb_right = child; parent = node; - } else + } else { + if (child) + rb_set_parent(child, parent); parent->rb_left = child; + } node->rb_parent_color = old->rb_parent_color; node->rb_right = old->rb_right; -- cgit v0.10.2 From 4b324126e0c6c3a5080ca3ec0981e8766ed6f1ee Mon Sep 17 00:00:00 2001 From: Wolfram Strepp Date: Tue, 16 Jun 2009 15:34:13 -0700 Subject: rb_tree: remove redundant if()-condition in rb_erase() Furthermore, notice that the initial checks: if (!node->rb_left) child = node->rb_right; else if (!node->rb_right) child = node->rb_left; else { ... } guarantee that old->rb_right is set in the final else branch, therefore we can omit checking that again. Signed-off-by: Wolfram Strepp Signed-off-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/rbtree.c b/lib/rbtree.c index 4b4b29b..e2aa3be 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -250,15 +250,15 @@ void rb_erase(struct rb_node *node, struct rb_root *root) if (child) rb_set_parent(child, parent); parent->rb_left = child; + + node->rb_right = old->rb_right; + rb_set_parent(old->rb_right, node); } node->rb_parent_color = old->rb_parent_color; - node->rb_right = old->rb_right; node->rb_left = old->rb_left; - rb_set_parent(old->rb_left, node); - if (old->rb_right) - rb_set_parent(old->rb_right, node); + goto color; } -- cgit v0.10.2 From c28842421cb6a29fd952043381bc5391bdf6be50 Mon Sep 17 00:00:00 2001 From: Eric Piel Date: Tue, 16 Jun 2009 15:34:13 -0700 Subject: lis3: fix misc device unregistering and printk Can only unregister the misc device if it was registered before. Also remove debugging messages, which in addition were not properly formatted.
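To make the normal/special distinction from the rbtree patches above concrete, here is a standalone sketch of the same successor splice on a plain parent-pointer binary tree; type and function names are illustrative, and the red-black recoloring (the "goto color" above) is deliberately omitted:

    struct tnode {
        struct tnode *left, *right, *parent;
    };

    /* erase a node O that has two children by splicing in successor N */
    static void erase_two_children(struct tnode *old, struct tnode **rootp)
    {
        struct tnode *node = old->right;        /* find successor N */
        while (node->left)
            node = node->left;

        /* point O's parent (or the root) at N */
        if (old->parent) {
            if (old->parent->left == old)
                old->parent->left = node;
            else
                old->parent->right = node;
        } else {
            *rootp = node;
        }

        if (node->parent == old) {
            /* special case: N is O's right child, C stays below N */
        } else {
            /* normal case: unlink N from P, reattach C, adopt O's right */
            struct tnode *child = node->right;

            if (child)
                child->parent = node->parent;
            node->parent->left = child;

            node->right = old->right;
            old->right->parent = node;
        }

        node->parent = old->parent;
        node->left = old->left;
        old->left->parent = node;
    }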
Signed-off-by: Eric Piel Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 778eb77..17f2003 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -199,7 +199,6 @@ static int lis3lv02d_misc_open(struct inode *inode, struct file *file) return -EBUSY; } lis3lv02d_increase_use(&lis3_dev); - printk("lis3: registered interrupt %d\n", lis3_dev.irq); return 0; } @@ -378,7 +377,8 @@ void lis3lv02d_joystick_disable(void) if (!lis3_dev.idev) return; - misc_deregister(&lis3lv02d_misc_device); + if (lis3_dev.irq) + misc_deregister(&lis3lv02d_misc_device); input_unregister_device(lis3_dev.idev); lis3_dev.idev = NULL; } @@ -493,8 +493,6 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) if (lis3lv02d_joystick_enable()) printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); - printk("lis3_init_device: irq %d\n", dev->irq); - /* bail if we did not get an IRQ from the bus layer */ if (!dev->irq) { printk(KERN_ERR DRIVER_NAME @@ -502,7 +500,6 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) goto out; } - printk("lis3: registering device\n"); if (misc_register(&lis3lv02d_misc_device)) printk(KERN_ERR DRIVER_NAME ": misc_register failed\n"); out: -- cgit v0.10.2 From a002ee896dfd08ce9fba44e9ae513c9094699a27 Mon Sep 17 00:00:00 2001 From: Eric Piel Date: Tue, 16 Jun 2009 15:34:14 -0700 Subject: lis3: remove automatic shutdown of the device After measurement on my laptop, it seems that turning off the device does not bring any energy saving (within 0.1W precision). So let's keep the device always on. It simplifies the code, and it avoids the problem of reading a wrong value sometimes just after turning the device on. Moreover, since commit ef2cfc790bf5f0ff189b01eabc0f4feb5e8524df had been too zealous, the device was actually never turned off anyway. This patch also restores the damages done by this commit concerning the initialisation/poweroff. Also do more clean up with the usage of the lis3_dev global variable. 
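The lis3_dev cleanup mentioned just above follows the usual driver rule: a helper that already receives a struct lis3lv02d * should dereference that argument rather than the file-scope global, so the logic would survive a second device instance. A minimal before/after sketch, not the verbatim driver code:

    /* before: takes an instance pointer, but still reads via the global */
    static s16 read_x_before(struct lis3lv02d *lis3)
    {
        return lis3_dev.read_data(lis3, OUTX);
    }

    /* after: the argument is the only device the helper knows about */
    static s16 read_x_after(struct lis3lv02d *lis3)
    {
        return lis3->read_data(lis3, OUTX);
    }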
Signed-off-by: Eric Piel Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index abca7e9..0ebd009 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -324,7 +324,7 @@ static int lis3lv02d_remove(struct acpi_device *device, int type) flush_work(&hpled_led.work); led_classdev_unregister(&hpled_led.led_classdev); - return lis3lv02d_remove_fs(); + return lis3lv02d_remove_fs(&lis3_dev); } @@ -338,13 +338,7 @@ static int lis3lv02d_suspend(struct acpi_device *device, pm_message_t state) static int lis3lv02d_resume(struct acpi_device *device) { - /* put back the device in the right state (ACPI might turn it on) */ - mutex_lock(&lis3_dev.lock); - if (lis3_dev.usage > 0) - lis3lv02d_poweron(&lis3_dev); - else - lis3lv02d_poweroff(&lis3_dev); - mutex_unlock(&lis3_dev.lock); + lis3lv02d_poweron(&lis3_dev); return 0; } #else diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 17f2003..df3f586 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -105,56 +105,39 @@ static void lis3lv02d_get_xyz(struct lis3lv02d *lis3, int *x, int *y, int *z) { int position[3]; - position[0] = lis3_dev.read_data(lis3, OUTX); - position[1] = lis3_dev.read_data(lis3, OUTY); - position[2] = lis3_dev.read_data(lis3, OUTZ); + position[0] = lis3->read_data(lis3, OUTX); + position[1] = lis3->read_data(lis3, OUTY); + position[2] = lis3->read_data(lis3, OUTZ); - *x = lis3lv02d_get_axis(lis3_dev.ac.x, position); - *y = lis3lv02d_get_axis(lis3_dev.ac.y, position); - *z = lis3lv02d_get_axis(lis3_dev.ac.z, position); + *x = lis3lv02d_get_axis(lis3->ac.x, position); + *y = lis3lv02d_get_axis(lis3->ac.y, position); + *z = lis3lv02d_get_axis(lis3->ac.z, position); } void lis3lv02d_poweroff(struct lis3lv02d *lis3) { - lis3_dev.is_on = 0; + /* disable X,Y,Z axis and power down */ + lis3->write(lis3, CTRL_REG1, 0x00); } EXPORT_SYMBOL_GPL(lis3lv02d_poweroff); void lis3lv02d_poweron(struct lis3lv02d *lis3) { - lis3_dev.is_on = 1; - lis3_dev.init(lis3); -} -EXPORT_SYMBOL_GPL(lis3lv02d_poweron); + u8 reg; -/* - * To be called before starting to use the device. It makes sure that the - * device will always be on until a call to lis3lv02d_decrease_use(). Not to be - * used from interrupt context. - */ -static void lis3lv02d_increase_use(struct lis3lv02d *dev) -{ - mutex_lock(&dev->lock); - dev->usage++; - if (dev->usage == 1) { - if (!dev->is_on) - lis3lv02d_poweron(dev); - } - mutex_unlock(&dev->lock); -} + lis3->init(lis3); -/* - * To be called whenever a usage of the device is stopped. - * It will make sure to turn off the device when there is not usage. - */ -static void lis3lv02d_decrease_use(struct lis3lv02d *dev) -{ - mutex_lock(&dev->lock); - dev->usage--; - if (dev->usage == 0) - lis3lv02d_poweroff(dev); - mutex_unlock(&dev->lock); + /* + * Common configuration + * BDU: LSB and MSB values are not updated until both have been read. + * So the value read will always be correct. 
+ */ + lis3->read(lis3, CTRL_REG2, ®); + reg |= CTRL2_BDU; + lis3->write(lis3, CTRL_REG2, reg); } +EXPORT_SYMBOL_GPL(lis3lv02d_poweron); + static irqreturn_t lis302dl_interrupt(int irq, void *dummy) { @@ -198,14 +181,12 @@ static int lis3lv02d_misc_open(struct inode *inode, struct file *file) printk(KERN_ERR DRIVER_NAME ": IRQ%d allocation failed\n", lis3_dev.irq); return -EBUSY; } - lis3lv02d_increase_use(&lis3_dev); return 0; } static int lis3lv02d_misc_release(struct inode *inode, struct file *file) { fasync_helper(-1, file, 0, &lis3_dev.async_queue); - lis3lv02d_decrease_use(&lis3_dev); free_irq(lis3_dev.irq, &lis3_dev); clear_bit(0, &lis3_dev.misc_opened); /* release the device */ return 0; @@ -314,10 +295,8 @@ static int lis3lv02d_joystick_kthread(void *data) static int lis3lv02d_joystick_open(struct input_dev *input) { - lis3lv02d_increase_use(&lis3_dev); lis3_dev.kthread = kthread_run(lis3lv02d_joystick_kthread, NULL, "klis3lv02d"); if (IS_ERR(lis3_dev.kthread)) { - lis3lv02d_decrease_use(&lis3_dev); return PTR_ERR(lis3_dev.kthread); } @@ -327,7 +306,6 @@ static int lis3lv02d_joystick_open(struct input_dev *input) static void lis3lv02d_joystick_close(struct input_dev *input) { kthread_stop(lis3_dev.kthread); - lis3lv02d_decrease_use(&lis3_dev); } static inline void lis3lv02d_calibrate_joystick(void) @@ -390,9 +368,7 @@ static ssize_t lis3lv02d_position_show(struct device *dev, { int x, y, z; - lis3lv02d_increase_use(&lis3_dev); lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z); - lis3lv02d_decrease_use(&lis3_dev); return sprintf(buf, "(%d,%d,%d)\n", x, y, z); } @@ -406,9 +382,7 @@ static ssize_t lis3lv02d_calibrate_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - lis3lv02d_increase_use(&lis3_dev); lis3lv02d_calibrate_joystick(); - lis3lv02d_decrease_use(&lis3_dev); return count; } @@ -420,9 +394,7 @@ static ssize_t lis3lv02d_rate_show(struct device *dev, u8 ctrl; int val; - lis3lv02d_increase_use(&lis3_dev); lis3_dev.read(&lis3_dev, CTRL_REG1, &ctrl); - lis3lv02d_decrease_use(&lis3_dev); val = (ctrl & (CTRL1_DF0 | CTRL1_DF1)) >> 4; return sprintf(buf, "%d\n", lis3lv02dl_df_val[val]); } @@ -446,17 +418,17 @@ static struct attribute_group lis3lv02d_attribute_group = { static int lis3lv02d_add_fs(struct lis3lv02d *lis3) { - lis3_dev.pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); - if (IS_ERR(lis3_dev.pdev)) - return PTR_ERR(lis3_dev.pdev); + lis3->pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); + if (IS_ERR(lis3->pdev)) + return PTR_ERR(lis3->pdev); - return sysfs_create_group(&lis3_dev.pdev->dev.kobj, &lis3lv02d_attribute_group); + return sysfs_create_group(&lis3->pdev->dev.kobj, &lis3lv02d_attribute_group); } -int lis3lv02d_remove_fs(void) +int lis3lv02d_remove_fs(struct lis3lv02d *lis3) { - sysfs_remove_group(&lis3_dev.pdev->dev.kobj, &lis3lv02d_attribute_group); - platform_device_unregister(lis3_dev.pdev); + sysfs_remove_group(&lis3->pdev->dev.kobj, &lis3lv02d_attribute_group); + platform_device_unregister(lis3->pdev); return 0; } EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs); @@ -482,13 +454,12 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) break; default: printk(KERN_ERR DRIVER_NAME - ": unknown sensor type 0x%X\n", lis3_dev.whoami); + ": unknown sensor type 0x%X\n", dev->whoami); return -EINVAL; } - mutex_init(&dev->lock); lis3lv02d_add_fs(dev); - lis3lv02d_increase_use(dev); + lis3lv02d_poweron(dev); if (lis3lv02d_joystick_enable()) printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); @@ -503,7 
+474,6 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) if (misc_register(&lis3lv02d_misc_device)) printk(KERN_ERR DRIVER_NAME ": misc_register failed\n"); out: - lis3lv02d_decrease_use(dev); return 0; } EXPORT_SYMBOL_GPL(lis3lv02d_init_device); diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 745ec96..b007d81 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -171,14 +171,11 @@ struct lis3lv02d { struct input_dev *idev; /* input device */ struct task_struct *kthread; /* kthread for input */ - struct mutex lock; struct platform_device *pdev; /* platform device */ atomic_t count; /* interrupt count after last read */ int xcalib; /* calibrated null value for x */ int ycalib; /* calibrated null value for y */ int zcalib; /* calibrated null value for z */ - unsigned char is_on; /* whether the device is on or off */ - unsigned char usage; /* usage counter */ struct axis_conversion ac; /* hw -> logical axis */ u32 irq; /* IRQ number */ @@ -192,6 +189,6 @@ int lis3lv02d_joystick_enable(void); void lis3lv02d_joystick_disable(void); void lis3lv02d_poweroff(struct lis3lv02d *lis3); void lis3lv02d_poweron(struct lis3lv02d *lis3); -int lis3lv02d_remove_fs(void); +int lis3lv02d_remove_fs(struct lis3lv02d *lis3); extern struct lis3lv02d lis3_dev; -- cgit v0.10.2 From dc6ea97bac6b8228c7a69740df35eed2be3407be Mon Sep 17 00:00:00 2001 From: Eric Piel Date: Tue, 16 Jun 2009 15:34:15 -0700 Subject: lis3: use input_polled_device Now that there is no need to hookup on the open/close of the joystick, it's possible to use the simplified interface input_polled_device, instead of creating our own kthread. [randy.dunlap@oracle.com: fix Kconfig] [randy.dunlap@oracle.com: fix Kconfig some more] Signed-off-by: Eric Piel Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index d73f5f4..eec7dca 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -940,6 +940,7 @@ config SENSORS_HDAPS config SENSORS_LIS3LV02D tristate "STMicroeletronics LIS3LV02Dx three-axis digital accelerometer" depends on ACPI && INPUT + select INPUT_POLLDEV select NEW_LEDS select LEDS_CLASS default n @@ -967,6 +968,7 @@ config SENSORS_LIS3LV02D config SENSORS_LIS3_SPI tristate "STMicroeletronics LIS3LV02Dx three-axis digital accelerometer (SPI)" depends on !ACPI && SPI_MASTER && INPUT + select INPUT_POLLDEV default n help This driver provides support for the LIS3LV02Dx accelerometer connected diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index 0ebd009..92db68e 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -27,9 +27,6 @@ #include #include #include -#include -#include -#include #include #include #include diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index df3f586..3661906 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -27,9 +27,7 @@ #include #include #include -#include -#include -#include +#include #include #include #include @@ -270,43 +268,16 @@ static struct miscdevice lis3lv02d_misc_device = { .fops = &lis3lv02d_misc_fops, }; -/** - * lis3lv02d_joystick_kthread - Kthread polling function - * @data: unused - here to conform to threadfn prototype - */ -static int lis3lv02d_joystick_kthread(void *data) +static void lis3lv02d_joystick_poll(struct input_polled_dev *pidev) { int x, y, z; - while (!kthread_should_stop()) { - lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z); - input_report_abs(lis3_dev.idev, ABS_X, x - 
lis3_dev.xcalib); - input_report_abs(lis3_dev.idev, ABS_Y, y - lis3_dev.ycalib); - input_report_abs(lis3_dev.idev, ABS_Z, z - lis3_dev.zcalib); - - input_sync(lis3_dev.idev); - - try_to_freeze(); - msleep_interruptible(MDPS_POLL_INTERVAL); - } - - return 0; -} - -static int lis3lv02d_joystick_open(struct input_dev *input) -{ - lis3_dev.kthread = kthread_run(lis3lv02d_joystick_kthread, NULL, "klis3lv02d"); - if (IS_ERR(lis3_dev.kthread)) { - return PTR_ERR(lis3_dev.kthread); - } - - return 0; + lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z); + input_report_abs(pidev->input, ABS_X, x - lis3_dev.xcalib); + input_report_abs(pidev->input, ABS_Y, y - lis3_dev.ycalib); + input_report_abs(pidev->input, ABS_Z, z - lis3_dev.zcalib); } -static void lis3lv02d_joystick_close(struct input_dev *input) -{ - kthread_stop(lis3_dev.kthread); -} static inline void lis3lv02d_calibrate_joystick(void) { @@ -316,33 +287,36 @@ static inline void lis3lv02d_calibrate_joystick(void) int lis3lv02d_joystick_enable(void) { + struct input_dev *input_dev; int err; if (lis3_dev.idev) return -EINVAL; - lis3_dev.idev = input_allocate_device(); + lis3_dev.idev = input_allocate_polled_device(); if (!lis3_dev.idev) return -ENOMEM; + lis3_dev.idev->poll = lis3lv02d_joystick_poll; + lis3_dev.idev->poll_interval = MDPS_POLL_INTERVAL; + input_dev = lis3_dev.idev->input; + lis3lv02d_calibrate_joystick(); - lis3_dev.idev->name = "ST LIS3LV02DL Accelerometer"; - lis3_dev.idev->phys = DRIVER_NAME "/input0"; - lis3_dev.idev->id.bustype = BUS_HOST; - lis3_dev.idev->id.vendor = 0; - lis3_dev.idev->dev.parent = &lis3_dev.pdev->dev; - lis3_dev.idev->open = lis3lv02d_joystick_open; - lis3_dev.idev->close = lis3lv02d_joystick_close; + input_dev->name = "ST LIS3LV02DL Accelerometer"; + input_dev->phys = DRIVER_NAME "/input0"; + input_dev->id.bustype = BUS_HOST; + input_dev->id.vendor = 0; + input_dev->dev.parent = &lis3_dev.pdev->dev; - set_bit(EV_ABS, lis3_dev.idev->evbit); - input_set_abs_params(lis3_dev.idev, ABS_X, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); - input_set_abs_params(lis3_dev.idev, ABS_Y, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); - input_set_abs_params(lis3_dev.idev, ABS_Z, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); + set_bit(EV_ABS, input_dev->evbit); + input_set_abs_params(input_dev, ABS_X, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); + input_set_abs_params(input_dev, ABS_Y, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); + input_set_abs_params(input_dev, ABS_Z, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); - err = input_register_device(lis3_dev.idev); + err = input_register_polled_device(lis3_dev.idev); if (err) { - input_free_device(lis3_dev.idev); + input_free_polled_device(lis3_dev.idev); lis3_dev.idev = NULL; } @@ -357,7 +331,7 @@ void lis3lv02d_joystick_disable(void) if (lis3_dev.irq) misc_deregister(&lis3lv02d_misc_device); - input_unregister_device(lis3_dev.idev); + input_unregister_polled_device(lis3_dev.idev); lis3_dev.idev = NULL; } EXPORT_SYMBOL_GPL(lis3lv02d_joystick_disable); diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index b007d81..5a5a196 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -18,6 +18,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include +#include /* * The actual chip is STMicroelectronics LIS3LV02DL or LIS3LV02DQ that seems to @@ -169,8 +171,7 @@ struct lis3lv02d { s16 (*read_data) (struct lis3lv02d 
*lis3, int reg); int mdps_max_val; - struct input_dev *idev; /* input device */ - struct task_struct *kthread; /* kthread for input */ + struct input_polled_dev *idev; /* input device */ struct platform_device *pdev; /* platform device */ atomic_t count; /* interrupt count after last read */ int xcalib; /* calibrated null value for x */ -- cgit v0.10.2 From 0093716e6dd18dad554bef81cc788a4c50d32a09 Mon Sep 17 00:00:00 2001 From: Eric Piel Date: Tue, 16 Jun 2009 15:34:16 -0700 Subject: lis3: add three new laptop models Separate the 6710 and 6715, and set the right axis information for the 6715. Reported-by: Isaac702 Add the 6930. Reported-by: Christian Weidle Add the 2710. Reported-by: Pavel Herrmann Signed-off-by: Eric Piel Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index 92db68e..6679854 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -158,6 +158,7 @@ static struct axis_conversion lis3lv02d_axis_normal = {1, 2, 3}; static struct axis_conversion lis3lv02d_axis_y_inverted = {1, -2, 3}; static struct axis_conversion lis3lv02d_axis_x_inverted = {-1, 2, 3}; static struct axis_conversion lis3lv02d_axis_z_inverted = {1, 2, -3}; +static struct axis_conversion lis3lv02d_axis_xy_swap = {2, 1, 3}; static struct axis_conversion lis3lv02d_axis_xy_rotated_left = {-2, 1, 3}; static struct axis_conversion lis3lv02d_axis_xy_rotated_left_usd = {-2, 1, -3}; static struct axis_conversion lis3lv02d_axis_xy_swap_inverted = {-2, -1, 3}; @@ -191,13 +192,16 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { AXIS_DMI_MATCH("NX9420", "HP Compaq nx9420", x_inverted), AXIS_DMI_MATCH("NW9440", "HP Compaq nw9440", x_inverted), AXIS_DMI_MATCH("NC2510", "HP Compaq 2510", y_inverted), + AXIS_DMI_MATCH("NC2710", "HP Compaq 2710", xy_swap), AXIS_DMI_MATCH("NC8510", "HP Compaq 8510", xy_swap_inverted), AXIS_DMI_MATCH("HP2133", "HP 2133", xy_rotated_left), AXIS_DMI_MATCH("HP2140", "HP 2140", xy_swap_inverted), AXIS_DMI_MATCH("NC653x", "HP Compaq 653", xy_rotated_left_usd), AXIS_DMI_MATCH("NC673x", "HP Compaq 673", xy_rotated_left_usd), AXIS_DMI_MATCH("NC651xx", "HP Compaq 651", xy_rotated_right), - AXIS_DMI_MATCH("NC671xx", "HP Compaq 671", xy_swap_yz_inverted), + AXIS_DMI_MATCH("NC6710x", "HP Compaq 6710", xy_swap_yz_inverted), + AXIS_DMI_MATCH("NC6715x", "HP Compaq 6715", y_inverted), + AXIS_DMI_MATCH("NC693xx", "HP EliteBook 693", xy_rotated_right), /* Intel-based HP Pavilion dv5 */ AXIS_DMI_MATCH2("HPDV5_I", PRODUCT_NAME, "HP Pavilion dv5", @@ -213,7 +217,6 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { { NULL, } /* Laptop models without axis info (yet): * "NC6910" "HP Compaq 6910" - * HP Compaq 8710x Notebook PC / Mobile Workstation * "NC2400" "HP Compaq nc2400" * "NX74x0" "HP Compaq nx74" * "NX6325" "HP Compaq nx6325" -- cgit v0.10.2 From 8f3128e714ded7cf1e8c786c204a4f253b5d8ff4 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 16 Jun 2009 15:34:17 -0700 Subject: lis3: add click function The LIS302DL accelerometer chip has a 'click' feature which can be used to detect sudden motion on any of the three axes. Configuration data is passed via SPI platform_data, and no action is taken if that's not specified, so it won't harm any existing platform. To make the configuration effective, the IRQ lines need to be set up appropriately. This patch also adds a way to do that from board support code, sketched below.
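For illustration, board support code might hook this up along the following lines (a minimal sketch: the modalias string, bus/chip-select numbers, IRQ number and all threshold/timing values are invented here and would have to match the real board and chip setup):

	#include <linux/lis3lv02d.h>
	#include <linux/spi/spi.h>

	/* single-click detection on all three axes, routed to IRQ1 */
	static struct lis3lv02d_platform_data board_lis3_pdata = {
		.click_flags	= LIS3_CLICK_SINGLE_X | LIS3_CLICK_SINGLE_Y |
				  LIS3_CLICK_SINGLE_Z,
		.click_thresh_x	= 8,	/* hypothetical 4-bit thresholds */
		.click_thresh_y	= 8,
		.click_thresh_z	= 10,
		.irq_cfg	= LIS3_IRQ1_CLICK | LIS3_IRQ_ACTIVE_HIGH,
	};

	static struct spi_board_info board_spi_devices[] __initdata = {
		{
			.modalias	= "lis3lv02d_spi",	/* hypothetical name */
			.platform_data	= &board_lis3_pdata,
			.bus_num	= 0,
			.chip_select	= 1,
			.irq		= 42,	/* board-specific IRQ line */
		},
	};

The array would then be passed to spi_register_board_info() from the board's init code, as usual for SPI devices.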
The DD_* definitions were factored out into their own enum because they are specific to LIS3LV02D devices. Signed-off-by: Daniel Mack Acked-by: Pavel Machek Acked-by: Eric Piel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 3661906..271338b 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -438,6 +438,26 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) if (lis3lv02d_joystick_enable()) printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); + /* passing in platform specific data is purely optional and only + * used by the SPI transport layer at the moment */ + if (dev->pdata) { + struct lis3lv02d_platform_data *p = dev->pdata; + + if (p->click_flags && (dev->whoami == LIS_SINGLE_ID)) { + dev->write(dev, CLICK_CFG, p->click_flags); + dev->write(dev, CLICK_TIMELIMIT, p->click_time_limit); + dev->write(dev, CLICK_LATENCY, p->click_latency); + dev->write(dev, CLICK_WINDOW, p->click_window); + dev->write(dev, CLICK_THSZ, p->click_thresh_z & 0xf); + dev->write(dev, CLICK_THSY_X, + (p->click_thresh_x & 0xf) | + (p->click_thresh_y << 4)); + } + + if (p->irq_cfg) + dev->write(dev, CTRL_REG3, p->irq_cfg); + } + /* bail if we did not get an IRQ from the bus layer */ if (!dev->irq) { printk(KERN_ERR DRIVER_NAME diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 5a5a196..e320e2f 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -29,12 +29,14 @@ * They can also be connected via I²C. */ +#include + /* 2-byte registers */ #define LIS_DOUBLE_ID 0x3A /* LIS3LV02D[LQ] */ /* 1-byte registers */ #define LIS_SINGLE_ID 0x3B /* LIS[32]02DL and others */ -enum lis3lv02d_reg { +enum lis3_reg { WHO_AM_I = 0x0F, OFFSET_X = 0x16, OFFSET_Y = 0x17, @@ -62,6 +64,19 @@ enum lis3lv02d_reg { FF_WU_THS_L = 0x34, FF_WU_THS_H = 0x35, FF_WU_DURATION = 0x36, +}; + +enum lis302d_reg { + CLICK_CFG = 0x38, + CLICK_SRC = 0x39, + CLICK_THSY_X = 0x3B, + CLICK_THSZ = 0x3C, + CLICK_TIMELIMIT = 0x3D, + CLICK_LATENCY = 0x3E, + CLICK_WINDOW = 0x3F, +}; + +enum lis3lv02d_reg { DD_CFG = 0x38, DD_SRC = 0x39, DD_ACK = 0x3A, @@ -183,6 +198,8 @@ struct lis3lv02d { struct fasync_struct *async_queue; /* queue for the misc device */ wait_queue_head_t misc_wait; /* Wait queue for the misc device */ unsigned long misc_opened; /* bit0: whether the device is open */ + + struct lis3lv02d_platform_data *pdata; /* for passing board config */ }; int lis3lv02d_init_device(struct lis3lv02d *lis3); diff --git a/drivers/hwmon/lis3lv02d_spi.c b/drivers/hwmon/lis3lv02d_spi.c index 07ae74b..3827ff0 100644 --- a/drivers/hwmon/lis3lv02d_spi.c +++ b/drivers/hwmon/lis3lv02d_spi.c @@ -72,6 +72,7 @@ static int __devinit lis302dl_spi_probe(struct spi_device *spi) lis3_dev.write = lis3_spi_write; lis3_dev.irq = spi->irq; lis3_dev.ac = lis3lv02d_axis_normal; + lis3_dev.pdata = spi->dev.platform_data; spi_set_drvdata(spi, &lis3_dev); ret = lis3lv02d_init_device(&lis3_dev); diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h new file mode 100644 index 0000000..ad651f4 --- /dev/null +++ b/include/linux/lis3lv02d.h @@ -0,0 +1,39 @@ +#ifndef __LIS3LV02D_H_ +#define __LIS3LV02D_H_ + +struct lis3lv02d_platform_data { + /* please note: the 'click' feature is only supported for + * LIS[32]02DL variants of the chip and will be ignored for + * others */ +#define LIS3_CLICK_SINGLE_X (1 << 0) +#define LIS3_CLICK_DOUBLE_X (1 << 1) +#define LIS3_CLICK_SINGLE_Y (1 << 2) +#define LIS3_CLICK_DOUBLE_Y (1 << 3)
+#define LIS3_CLICK_SINGLE_Z (1 << 4) +#define LIS3_CLICK_DOUBLE_Z (1 << 5) + unsigned char click_flags; + unsigned char click_thresh_x; + unsigned char click_thresh_y; + unsigned char click_thresh_z; + unsigned char click_time_limit; + unsigned char click_latency; + unsigned char click_window; + +#define LIS3_IRQ1_DISABLE (0 << 0) +#define LIS3_IRQ1_FF_WU_1 (1 << 0) +#define LIS3_IRQ1_FF_WU_2 (2 << 0) +#define LIS3_IRQ1_FF_WU_12 (3 << 0) +#define LIS3_IRQ1_DATA_READY (4 << 0) +#define LIS3_IRQ1_CLICK (7 << 0) +#define LIS3_IRQ2_DISABLE (0 << 3) +#define LIS3_IRQ2_FF_WU_1 (1 << 3) +#define LIS3_IRQ2_FF_WU_2 (2 << 3) +#define LIS3_IRQ2_FF_WU_12 (3 << 3) +#define LIS3_IRQ2_DATA_READY (4 << 3) +#define LIS3_IRQ2_CLICK (7 << 3) +#define LIS3_IRQ_OPEN_DRAIN (1 << 6) +#define LIS3_IRQ_ACTIVE_HIGH (1 << 7) + unsigned char irq_cfg; +}; + +#endif /* __LIS3LV02D_H_ */ -- cgit v0.10.2 From a53c9d5b7115173fba9f82ff8120b624ef206f48 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Tue, 16 Jun 2009 15:34:18 -0700 Subject: fbdev: generated logo sources depend on scripts/pnmtologo The generated logo sources are not automatically regenerated if scripts/pnmtologo.c has changed. Add the missing dependency to fix this. Signed-off-by: Sam Ravnborg Tested-by: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/logo/Makefile b/drivers/video/logo/Makefile index b91251d..3b43781 100644 --- a/drivers/video/logo/Makefile +++ b/drivers/video/logo/Makefile @@ -37,22 +37,24 @@ extra-y += $(call logo-cfiles,_clut224,ppm) # Gray 256 extra-y += $(call logo-cfiles,_gray256,pgm) +pnmtologo := scripts/pnmtologo + # Create commands like "pnmtologo -t mono -n logo_mac_mono -o ..." quiet_cmd_logo = LOGO $@ - cmd_logo = scripts/pnmtologo \ + cmd_logo = $(pnmtologo) \ -t $(patsubst $*_%,%,$(notdir $(basename $<))) \ -n $(notdir $(basename $<)) -o $@ $< -$(obj)/%_mono.c: $(src)/%_mono.pbm FORCE +$(obj)/%_mono.c: $(src)/%_mono.pbm $(pnmtologo) FORCE $(call if_changed,logo) -$(obj)/%_vga16.c: $(src)/%_vga16.ppm FORCE +$(obj)/%_vga16.c: $(src)/%_vga16.ppm $(pnmtologo) FORCE $(call if_changed,logo) -$(obj)/%_clut224.c: $(src)/%_clut224.ppm FORCE +$(obj)/%_clut224.c: $(src)/%_clut224.ppm $(pnmtologo) FORCE $(call if_changed,logo) -$(obj)/%_gray256.c: $(src)/%_gray256.pgm FORCE +$(obj)/%_gray256.c: $(src)/%_gray256.pgm $(pnmtologo) FORCE $(call if_changed,logo) # Files generated that shall be removed upon make clean -- cgit v0.10.2 From ae52bb2384f721562f15f719de1acb8e934733cb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 Jun 2009 15:34:19 -0700 Subject: fbdev: move logo externs to header file Now we have __initconst, we can finally move the external declarations for the various Linux logo structures to . James' ack dates back to the previous submission (way too long ago), when the logos were still __initdata, which caused failures on some platforms with some toolchain versions.
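The point of __initconst here: a const object placed in the writable .init.data section (which is what __initdata does) trips a section-type conflict with some gcc versions, while __initconst puts const init data into .init.rodata. A minimal before/after sketch, mirroring the comment fix in the diff below:

	/* rejected by some toolchains: const data in a non-const section */
	static const char linux_logo[] __initdata  = { 0x32, 0x36 };

	/* correct: const init data goes into .init.rodata */
	static const char linux_logo[] __initconst = { 0x32, 0x36 };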
Signed-off-by: Geert Uytterhoeven Acked-by: James Simmons Cc: Krzysztof Helt Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 2f0e64b..ef6f649 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -44,10 +44,7 @@ #include #include -#ifdef CONFIG_LOGO_LINUX_CLUT224 #include -extern const struct linux_logo logo_linux_clut224; -#endif /* * Properties whose value is longer than this get excluded from our diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 9abd210..8547e86 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -752,17 +752,8 @@ static int __init init_spu_base(void) goto out_unregister_sysdev_class; } - if (ret > 0) { - /* - * We cannot put the forward declaration in - * because of conflicting session type - * conflicts for const and __initdata with different compiler - * versions - */ - extern const struct linux_logo logo_spe_clut224; - + if (ret > 0) fb_append_extra_logo(&logo_spe_clut224, ret); - } mutex_lock(&spu_full_list_mutex); xmon_register_spus(&spu_full_list); diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h index 37dd097..b3906f8 100644 --- a/arch/um/include/shared/init.h +++ b/arch/um/include/shared/init.h @@ -27,7 +27,7 @@ * sign followed by value, e.g.: * * static int init_variable __initdata = 0; - * static char linux_logo[] __initdata = { 0x32, 0x36, ... }; + * static const char linux_logo[] __initconst = { 0x32, 0x36, ... }; * * Don't forget to initialize data not at file scope, i.e. within a function, * as gcc otherwise puts the data into the bss section and not into the init diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index 2e85a2b..ea7a8cc 100644 --- a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -21,21 +21,6 @@ #include #endif -extern const struct linux_logo logo_linux_mono; -extern const struct linux_logo logo_linux_vga16; -extern const struct linux_logo logo_linux_clut224; -extern const struct linux_logo logo_blackfin_vga16; -extern const struct linux_logo logo_blackfin_clut224; -extern const struct linux_logo logo_dec_clut224; -extern const struct linux_logo logo_mac_clut224; -extern const struct linux_logo logo_parisc_clut224; -extern const struct linux_logo logo_sgi_clut224; -extern const struct linux_logo logo_sun_clut224; -extern const struct linux_logo logo_superh_mono; -extern const struct linux_logo logo_superh_vga16; -extern const struct linux_logo logo_superh_clut224; -extern const struct linux_logo logo_m32r_clut224; - static int nologo; module_param(nologo, bool, 0); MODULE_PARM_DESC(nologo, "Disables startup logo"); diff --git a/include/linux/init.h b/include/linux/init.h index b218980..8c2c998 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -29,7 +29,7 @@ * sign followed by value, e.g.: * * static int init_variable __initdata = 0; - * static char linux_logo[] __initdata = { 0x32, 0x36, ... }; + * static const char linux_logo[] __initconst = { 0x32, 0x36, ... }; * * Don't forget to initialize data not at file scope, i.e. 
within a function, * as gcc otherwise puts the data into the bss section and not into the init diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h index 08a9296..ca5bd91 100644 --- a/include/linux/linux_logo.h +++ b/include/linux/linux_logo.h @@ -32,6 +32,22 @@ struct linux_logo { const unsigned char *data; }; +extern const struct linux_logo logo_linux_mono; +extern const struct linux_logo logo_linux_vga16; +extern const struct linux_logo logo_linux_clut224; +extern const struct linux_logo logo_blackfin_vga16; +extern const struct linux_logo logo_blackfin_clut224; +extern const struct linux_logo logo_dec_clut224; +extern const struct linux_logo logo_mac_clut224; +extern const struct linux_logo logo_parisc_clut224; +extern const struct linux_logo logo_sgi_clut224; +extern const struct linux_logo logo_sun_clut224; +extern const struct linux_logo logo_superh_mono; +extern const struct linux_logo logo_superh_vga16; +extern const struct linux_logo logo_superh_clut224; +extern const struct linux_logo logo_m32r_clut224; +extern const struct linux_logo logo_spe_clut224; + extern const struct linux_logo *fb_find_logo(int depth); #ifdef CONFIG_FB_LOGO_EXTRA extern void fb_append_extra_logo(const struct linux_logo *logo, diff --git a/scripts/pnmtologo.c b/scripts/pnmtologo.c index 6aa2a24..64f5ddb 100644 --- a/scripts/pnmtologo.c +++ b/scripts/pnmtologo.c @@ -237,22 +237,22 @@ static void write_header(void) fprintf(out, " * Linux logo %s\n", logoname); fputs(" */\n\n", out); fputs("#include \n\n", out); - fprintf(out, "static unsigned char %s_data[] __initdata = {\n", + fprintf(out, "static const unsigned char %s_data[] __initconst = {\n", logoname); } static void write_footer(void) { fputs("\n};\n\n", out); - fprintf(out, "struct linux_logo %s __initdata = {\n", logoname); - fprintf(out, " .type\t= %s,\n", logo_types[logo_type]); - fprintf(out, " .width\t= %d,\n", logo_width); - fprintf(out, " .height\t= %d,\n", logo_height); + fprintf(out, "const struct linux_logo %s __initconst = {\n", logoname); + fprintf(out, "\t.type\t\t= %s,\n", logo_types[logo_type]); + fprintf(out, "\t.width\t\t= %d,\n", logo_width); + fprintf(out, "\t.height\t\t= %d,\n", logo_height); if (logo_type == LINUX_LOGO_CLUT224) { - fprintf(out, " .clutsize\t= %d,\n", logo_clutsize); - fprintf(out, " .clut\t= %s_clut,\n", logoname); + fprintf(out, "\t.clutsize\t= %d,\n", logo_clutsize); + fprintf(out, "\t.clut\t\t= %s_clut,\n", logoname); } - fprintf(out, " .data\t= %s_data\n", logoname); + fprintf(out, "\t.data\t\t= %s_data\n", logoname); fputs("};\n\n", out); /* close logo file */ @@ -374,7 +374,7 @@ static void write_logo_clut224(void) fputs("\n};\n\n", out); /* write logo clut */ - fprintf(out, "static unsigned char %s_clut[] __initdata = {\n", + fprintf(out, "static const unsigned char %s_clut[] __initconst = {\n", logoname); write_hex_cnt = 0; for (i = 0; i < logo_clutsize; i++) { -- cgit v0.10.2 From 27b7f2e3b587f01d2cc901b48716eed4bd90fbe4 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 16 Jun 2009 15:34:20 -0700 Subject: fbdev: add video modes for resolutions and timings of PAL RGB This patch was taken from vga-sync-field version 0.0.3 [1][2]. 
[1] http://lowbyte.de/vga-sync-fields/vga-sync-fields-0.0.3.tgz [2] http://git.hellersdorfer-jugendchor.de/?p=vga2scart.git;a=commit;h=c5c8ed6c51fc9879dbf38d8b91d5db6f4300ea03 Signed-off-by: Thomas Hilber Signed-off-by: Paul Menzel Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/modedb.c b/drivers/video/modedb.c index 1618624..34e4e79 100644 --- a/drivers/video/modedb.c +++ b/drivers/video/modedb.c @@ -264,6 +264,14 @@ static const struct fb_videomode modedb[] = { /* 1280x800, 60 Hz, 47.403 kHz hsync, WXGA 16:10 aspect ratio */ NULL, 60, 1280, 800, 12048, 200, 64, 24, 1, 136, 3, 0, FB_VMODE_NONINTERLACED + }, { + /* 720x576i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ + NULL, 50, 720, 576, 74074, 64, 16, 39, 5, 64, 5, + 0, FB_VMODE_INTERLACED + }, { + /* 800x520i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ + NULL, 50, 800, 520, 58823, 144, 64, 72, 28, 80, 5, + 0, FB_VMODE_INTERLACED }, }; -- cgit v0.10.2 From 2d9d2fdfae4cf7fda90178a9daf0f8f750043ae8 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 16 Jun 2009 15:34:21 -0700 Subject: Documentation/fb/vesafb.txt: fix typo Signed-off-by: Paul Menzel Cc: Gerd Knorr Cc: Nico Schmoigl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt index ee277dd..950d5a6 100644 --- a/Documentation/fb/vesafb.txt +++ b/Documentation/fb/vesafb.txt @@ -95,7 +95,7 @@ There is no way to change the vesafb video mode and/or timings after booting linux. If you are not happy with the 60 Hz refresh rate, you have these options: - * configure and load the DOS-Tools for your the graphics board (if + * configure and load the DOS-Tools for the graphics board (if available) and boot linux with loadlin. * use a native driver (matroxfb/atyfb) instead if vesafb. If none is available, write a new one! -- cgit v0.10.2 From 7ec42d2659e81f068c5392fd5cb2f5b4bd35e880 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 16 Jun 2009 15:34:22 -0700 Subject: chipsfb: remove redundant assignment The removed assignment is already done inside framebuffer_alloc() earlier. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/chipsfb.c b/drivers/video/chipsfb.c index 777389c..57b9d27 100644 --- a/drivers/video/chipsfb.c +++ b/drivers/video/chipsfb.c @@ -414,7 +414,6 @@ chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) } pci_set_drvdata(dp, p); - p->device = &dp->dev; init_chips(p, addr); -- cgit v0.10.2 From 100b4a6eefb2ec335a2ae82356dad1b506ded8ed Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 16 Jun 2009 15:34:23 -0700 Subject: igafb: use framebuffer_alloc() to allocate fb_info struct Use the framebuffer_alloc() function to allocate the fb_info structure so the structure is correctly initialized after allocation. Signed-off-by: Krzysztof Helt Cc: Geert Uytterhoeven Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/igafb.c b/drivers/video/igafb.c index 3a81060..15d2001 100644 --- a/drivers/video/igafb.c +++ b/drivers/video/igafb.c @@ -395,17 +395,16 @@ int __init igafb_init(void) /* We leak a reference here but as it cannot be unloaded this is fine.
If you write unload code remember to free it in unload */ - size = sizeof(struct fb_info) + sizeof(struct iga_par) + sizeof(u32)*16; + size = sizeof(struct iga_par) + sizeof(u32)*16; - info = kzalloc(size, GFP_ATOMIC); + info = framebuffer_alloc(size, &pdev->dev); if (!info) { printk("igafb_init: can't alloc fb_info\n"); pci_dev_put(pdev); return -ENOMEM; } - par = (struct iga_par *) (info + 1); - + par = info->par; if ((addr = pdev->resource[0].start) == 0) { printk("igafb_init: no memory start\n"); @@ -526,7 +525,6 @@ int __init igafb_init(void) info->var = default_var; info->fix = igafb_fix; info->pseudo_palette = (void *)(par + 1); - info->device = &pdev->dev; if (!iga_init(info, par)) { iounmap((void *)par->io_base); -- cgit v0.10.2 From 4113819eb360555a91a8291f37bbbe9d26c5b275 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 16 Jun 2009 15:34:23 -0700 Subject: offb: use framebuffer_alloc() to allocate fb_info struct Use the framebuffer_alloc() function to allocate the fb_info structure so the structure is correctly initialized after allocation. Signed-off-by: Krzysztof Helt Cc: Geert Uytterhoeven Acked-by: Benjamin Herrenschmidt Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/offb.c b/drivers/video/offb.c index e1d9eeb..bb915a4 100644 --- a/drivers/video/offb.c +++ b/drivers/video/offb.c @@ -378,7 +378,6 @@ static void __init offb_init_fb(const char *name, const char *full_name, struct fb_fix_screeninfo *fix; struct fb_var_screeninfo *var; struct fb_info *info; - int size; if (!request_mem_region(res_start, res_size, "offb")) return; @@ -393,15 +392,12 @@ static void __init offb_init_fb(const char *name, const char *full_name, return; } - size = sizeof(struct fb_info) + sizeof(u32) * 16; - - info = kmalloc(size, GFP_ATOMIC); + info = framebuffer_alloc(sizeof(u32) * 16, NULL); if (info == 0) { release_mem_region(res_start, res_size); return; } - memset(info, 0, size); fix = &info->fix; var = &info->var; -- cgit v0.10.2 From 97b9a5a28b5fd02ceb3fcccee05e39dd62e4f474 Mon Sep 17 00:00:00 2001 From: Ben Nizette Date: Tue, 16 Jun 2009 15:34:24 -0700 Subject: atmel-lcdc: fix pixclock upper bound detection AFAICT the code which checks that the requested pixclock value is within bounds is incorrect. It ensures that the lcdc core clock is at least (bytes per pixel) times higher than the pixel clock rather than just greater than or equal to. There are tighter restrictions on the pixclock value as a function of bus width for STN panels but even then it isn't a simple relationship as currently checked for. IMO either something like the below patch should be applied or else more detailed checking logic should be implemented which takes into account the panel type as well.
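To put numbers on that (values invented for illustration): with a 60 MHz LCDC core clock, clk_value_khz is 60000. For a 32 bpp panel the old test rejected any pixel clock above 15 MHz (60 MHz divided by 4 bytes per pixel), while the new test below only rejects pixel clocks faster than the 60 MHz core clock itself:

	/* old: scaled the pixel clock by bytes per pixel before comparing */
	if ((PICOS2KHZ(var->pixclock) * var->bits_per_pixel / 8) > clk_value_khz)
		return -EINVAL;

	/* new: the pixel clock alone must not exceed the core clock */
	if (PICOS2KHZ(var->pixclock) > clk_value_khz)
		return -EINVAL;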
Signed-off-by: Ben Nizette Acked-by: Nicolas Ferre Cc: Haavard Skinnemoen Cc: Daniel Glockner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/atmel_lcdfb.c b/drivers/video/atmel_lcdfb.c index 2fb63f6..5afd644 100644 --- a/drivers/video/atmel_lcdfb.c +++ b/drivers/video/atmel_lcdfb.c @@ -345,7 +345,7 @@ static int atmel_lcdfb_check_var(struct fb_var_screeninfo *var, dev_dbg(dev, " bpp: %u\n", var->bits_per_pixel); dev_dbg(dev, " clk: %lu KHz\n", clk_value_khz); - if ((PICOS2KHZ(var->pixclock) * var->bits_per_pixel / 8) > clk_value_khz) { + if (PICOS2KHZ(var->pixclock) > clk_value_khz) { dev_err(dev, "%lu KHz pixel clock is too fast\n", PICOS2KHZ(var->pixclock)); return -EINVAL; } -- cgit v0.10.2 From 39000d654c2a22ca51fe92a39003d5fade59e9e4 Mon Sep 17 00:00:00 2001 From: InKi Dae Date: Tue, 16 Jun 2009 15:34:27 -0700 Subject: Samsung SoC Framebuffer driver: add Alpha Channel support Add support for the ARGB1888 and ARGB4888 hardware to the Samsung SoC Framebuffer driver (s3c-fb.c). ARGB1888 and ARGB4888 are decided by var->transp.length, and this variable is set by s3c_fb_check_var(). In s3c_fb_check_var(), if var->bits_per_pixel is 25 or 28, then var->transp.length would be 1 or 4. Therefore the alpha mode (ARGB1888 or ARGB4888) can be decided through that variable. To use alpha mode, you need to set up the following. This code should be added to your machine support code as platform data. static struct s3c_fb_pd_win xxx_fb_win0 = { /* this is to ensure we use win0 */ .win_mode = { .pixclock = (8+8+8+240)*(38+4+38+400), .left_margin = 8, .right_margin = 8, .upper_margin = 38, .lower_margin = 38, .hsync_len = 8, .vsync_len = 4, .xres = 240, .yres = 400, }, .max_bpp = 32, .default_bpp = 24, }; static struct s3c_fb_pd_win xxx_fb_win1 = { .win_mode = { .pixclock = (8+8+8+240)*(38+4+38+400), .left_margin = 8, .right_margin = 8, .upper_margin = 38, .lower_margin = 38, .hsync_len = 8, .vsync_len = 4, .xres = 240, .yres = 400, }, .max_bpp = 32, .default_bpp = 28, }; static struct s3c_fb_platdata xxx_lcd_pdata __initdata = { .win[0] = &xxx_fb_win0, .win[1] = &xxx_fb_win1, .vidcon0 = VIDCON0_VIDOUT_RGB | VIDCON0_PNRMODE_RGB, .vidcon1 = VIDCON1_INV_HSYNC | VIDCON1_INV_VSYNC, .setup_gpio = xxx_fb_gpio_setup, }; s3c_fb_set_platdata(&xxx_lcd_pdata); The above code sets the pixel format for the window0 layer to RGB888 and the window1 layer to ARGB4888.
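To recap the resulting mapping (as implemented by the s3c-fb diff below):

	default_bpp	transp.length	window pixel format
	24		0		RGB888, no blending
	25		1		ARGB1888, per-pixel blending (WINCON1_BLD_PIX)
	28		4		ARGB4888, per-pixel blending plus 4-bit alpha
					value (WINCON1_BLD_PIX | WINCON1_ALPHA_SEL)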
Signed-off-by: InKi Dae Cc: Ben Dooks Cc: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/s3c-fb.c b/drivers/video/s3c-fb.c index d3a568e..53bca28 100644 --- a/drivers/video/s3c-fb.c +++ b/drivers/video/s3c-fb.c @@ -358,9 +358,16 @@ static int s3c_fb_set_par(struct fb_info *info) writel(data, regs + VIDOSD_B(win_no)); data = var->xres * var->yres; + + u32 osdc_data = 0; + + osdc_data = VIDISD14C_ALPHA1_R(0xf) | + VIDISD14C_ALPHA1_G(0xf) | + VIDISD14C_ALPHA1_B(0xf); + if (s3c_fb_has_osd_d(win_no)) { writel(data, regs + VIDOSD_D(win_no)); - writel(0, regs + VIDOSD_C(win_no)); + writel(osdc_data, regs + VIDOSD_C(win_no)); } else writel(data, regs + VIDOSD_C(win_no)); @@ -409,8 +416,12 @@ static int s3c_fb_set_par(struct fb_info *info) data |= WINCON1_BPPMODE_19BPP_A1666; else data |= WINCON1_BPPMODE_18BPP_666; - } else if (var->transp.length != 0) - data |= WINCON1_BPPMODE_25BPP_A1888; + } else if (var->transp.length == 1) + data |= WINCON1_BPPMODE_25BPP_A1888 + | WINCON1_BLD_PIX; + else if (var->transp.length == 4) + data |= WINCON1_BPPMODE_28BPP_A4888 + | WINCON1_BLD_PIX | WINCON1_ALPHA_SEL; else data |= WINCON0_BPPMODE_24BPP_888; @@ -418,6 +429,20 @@ static int s3c_fb_set_par(struct fb_info *info) break; } + /* It has no color key control register for window0 */ + if (win_no > 0) { + u32 keycon0_data = 0, keycon1_data = 0; + + keycon0_data = ~(WxKEYCON0_KEYBL_EN | + WxKEYCON0_KEYEN_F | + WxKEYCON0_DIRCON) | WxKEYCON0_COMPKEY(0); + + keycon1_data = WxKEYCON1_COLVAL(0xffffff); + + writel(keycon0_data, regs + WxKEYCONy(win_no-1, 0)); + writel(keycon1_data, regs + WxKEYCONy(win_no-1, 1)); + } + writel(data, regs + WINCON(win_no)); writel(0x0, regs + WINxMAP(win_no)); -- cgit v0.10.2 From 336e747eebef7117b9acabb606bd817f1f1c5106 Mon Sep 17 00:00:00 2001 From: Julian Calaby Date: Tue, 16 Jun 2009 15:34:29 -0700 Subject: mb862xxfb: restrict compilation of platform driver to PPC The OpenFirmware part of this driver is uncompilable on SPARC due to its dependence on several PPC-specific functions. Restrict this to PPC to prevent these build errors: CC drivers/video/mb862xx/mb862xxfb.o drivers/video/mb862xx/mb862xxfb.c: In function 'of_platform_mb862xx_probe': drivers/video/mb862xx/mb862xxfb.c:559: error: implicit declaration of function 'of_address_to_resource' drivers/video/mb862xx/mb862xxfb.c:575: error: 'NO_IRQ' undeclared (first use in this function) drivers/video/mb862xx/mb862xxfb.c:575: error: (Each undeclared identifier is reported only once drivers/video/mb862xx/mb862xxfb.c:575: error: for each function it appears in.) This was found using randconfig builds. Signed-off-by: Julian Calaby Signed-off-by: Anatolij Gustschin Cc: Arnd Bergmann Cc: Anatolij Gustschin Cc: "David S. Miller" Cc: Michal Simek Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 2b5a691..932ffdb 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -2104,6 +2104,7 @@ config FB_MB862XX_LIME bool "Lime GDC" depends on FB_MB862XX depends on OF && !FB_MB862XX_PCI_GDC + depends on PPC select FB_FOREIGN_ENDIAN select FB_LITTLE_ENDIAN ---help--- -- cgit v0.10.2 From 24f01dcb53a950999f42f55123f7bc4ccda4ca57 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jun 2009 15:34:31 -0700 Subject: mb862xxfb: use CONFIG_OF instead of CONFIG_PPC_OF With this change, the driver builds fine on Microblaze, which helps allyesconfig compile tests.
I did not test sparc, but the change should have the same effect there. Signed-off-by: Arnd Bergmann Acked-by: Anatolij Gustschin Tested-by: Anatolij Gustschin Cc: "David S. Miller" Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/mb862xx/mb862xxfb.c b/drivers/video/mb862xx/mb862xxfb.c index fb64234..a28e3cf 100644 --- a/drivers/video/mb862xx/mb862xxfb.c +++ b/drivers/video/mb862xx/mb862xxfb.c @@ -19,7 +19,7 @@ #include #include #include -#if defined(CONFIG_PPC_OF) +#if defined(CONFIG_OF) #include #endif #include "mb862xxfb.h" -- cgit v0.10.2 From 34308fd4a22b4c24f54951e47d14e6ae5de6e150 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 16 Jun 2009 15:34:32 -0700 Subject: acornfb: remove fb_mmap function The driver's fb_mmap function is essentially the same as the generic fb_mmap function. Delete the driver's function and use the generic one. A difference is that the generic function marks frame buffer memory as VM_IO | VM_RESERVED. The driver's function marks it as VM_IO only. Signed-off-by: Krzysztof Helt Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index 6995fe1..0bcc59e 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -859,43 +859,6 @@ acornfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) return 0; } -/* - * Note that we are entered with the kernel locked. - */ -static int -acornfb_mmap(struct fb_info *info, struct vm_area_struct *vma) -{ - unsigned long off, start; - u32 len; - - off = vma->vm_pgoff << PAGE_SHIFT; - - start = info->fix.smem_start; - len = PAGE_ALIGN(start & ~PAGE_MASK) + info->fix.smem_len; - start &= PAGE_MASK; - if ((vma->vm_end - vma->vm_start + off) > len) - return -EINVAL; - off += start; - vma->vm_pgoff = off >> PAGE_SHIFT; - - /* This is an IO map - tell maydump to skip this VMA */ - vma->vm_flags |= VM_IO; - - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - - /* - * Don't alter the page protection flags; we want to keep the area - * cached for better performance. This does mean that we may miss - * some updates to the screen occasionally, but process switches - * should cause the caches and buffers to be flushed often enough. - */ - if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - return 0; -} - static struct fb_ops acornfb_ops = { .owner = THIS_MODULE, .fb_check_var = acornfb_check_var, @@ -905,7 +868,6 @@ static struct fb_ops acornfb_ops = { .fb_fillrect = cfb_fillrect, .fb_copyarea = cfb_copyarea, .fb_imageblit = cfb_imageblit, - .fb_mmap = acornfb_mmap, }; /* -- cgit v0.10.2 From 493f139ecf9ee72f73ccbabd016325a145e884ee Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 16 Jun 2009 15:34:32 -0700 Subject: carminefb: fix possible access beyond end of carmine_modedb[] This check is off-by-one.
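For an array of N entries the valid indices are 0 to N-1, so the old test still accepted fb_mode == ARRAY_SIZE(carmine_modedb) and could index one element past the end of the array:

	/* old: lets fb_mode == ARRAY_SIZE(carmine_modedb) through */
	if (fb_mode > ARRAY_SIZE(carmine_modedb))
		fb_mode = CARMINEFB_DEFAULT_VIDEO_MODE;

	/* new: also catches the first out-of-range index */
	if (fb_mode >= ARRAY_SIZE(carmine_modedb))
		fb_mode = CARMINEFB_DEFAULT_VIDEO_MODE;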
Signed-off-by: Roel Kluin Cc: Sebastian Siewior Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/carminefb.c b/drivers/video/carminefb.c index c7ff3c1..0c02f8e 100644 --- a/drivers/video/carminefb.c +++ b/drivers/video/carminefb.c @@ -562,7 +562,7 @@ static int __devinit alloc_carmine_fb(void __iomem *regs, void __iomem *smem_bas if (ret < 0) goto err_free_fb; - if (fb_mode > ARRAY_SIZE(carmine_modedb)) + if (fb_mode >= ARRAY_SIZE(carmine_modedb)) fb_mode = CARMINEFB_DEFAULT_VIDEO_MODE; par->cur_mode = par->new_mode = ~0; -- cgit v0.10.2 From ddc518d9f88d7cf82bd974737ce977193785335d Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 16 Jun 2009 15:34:33 -0700 Subject: s3c-fb: fix resource releasing on error during probing All resources are released in s3c_fb_win_release(), so remove the duplicated releasing elsewhere. Also release the allocated fb_info structure. Signed-off-by: Krzysztof Helt Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/s3c-fb.c b/drivers/video/s3c-fb.c index 53bca28..43680e5 100644 --- a/drivers/video/s3c-fb.c +++ b/drivers/video/s3c-fb.c @@ -725,9 +725,12 @@ static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win) */ static void s3c_fb_release_win(struct s3c_fb *sfb, struct s3c_fb_win *win) { - fb_dealloc_cmap(&win->fbinfo->cmap); - unregister_framebuffer(win->fbinfo); - s3c_fb_free_memory(sfb, win); + if (win->fbinfo) { + unregister_framebuffer(win->fbinfo); + fb_dealloc_cmap(&win->fbinfo->cmap); + s3c_fb_free_memory(sfb, win); + framebuffer_release(win->fbinfo); + } } /** @@ -778,7 +781,7 @@ static int __devinit s3c_fb_probe_win(struct s3c_fb *sfb, unsigned int win_no, ret = s3c_fb_alloc_memory(sfb, win); if (ret) { dev_err(sfb->dev, "failed to allocate display memory\n"); - goto err_framebuffer; + return ret; } /* setup the r/b/g positions for the window's palette */ @@ -801,7 +804,7 @@ static int __devinit s3c_fb_probe_win(struct s3c_fb *sfb, unsigned int win_no, ret = s3c_fb_check_var(&fbinfo->var, fbinfo); if (ret < 0) { dev_err(sfb->dev, "check_var failed on initial video params\n"); - goto err_alloc_mem; + return ret; } /* create initial colour map */ @@ -821,20 +824,13 @@ static int __devinit s3c_fb_probe_win(struct s3c_fb *sfb, unsigned int win_no, ret = register_framebuffer(fbinfo); if (ret < 0) { dev_err(sfb->dev, "failed to register framebuffer\n"); - goto err_alloc_mem; + return ret; } *res = win; dev_info(sfb->dev, "window %d: fb %s\n", win_no, fbinfo->fix.id); return 0; - -err_alloc_mem: - s3c_fb_free_memory(sfb, win); - -err_framebuffer: - unregister_framebuffer(fbinfo); - return ret; } /** -- cgit v0.10.2 From 0dac6ecdc056b83ac66e5b5c923fb73268f4332d Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 16 Jun 2009 15:34:34 -0700 Subject: s3c-fb: CPUFREQ frequency scaling support Add support for CPU frequency scaling in the S3C24XX video driver.
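The mechanism used below is the standard cpufreq transition notifier: the driver caches the current clock rate and redoes its timing calculation when the rate changes under it. Sketched in miniature (simplified from the patch; the actual reprogramming step is elided):

	static int s3c2410fb_cpufreq_transition(struct notifier_block *nb,
						unsigned long val, void *data)
	{
		struct s3c2410fb_info *info =
			container_of(nb, struct s3c2410fb_info, freq_transition);
		/* <0 means the clock sped up, >0 means it slowed down */
		long delta_f = info->clk_rate - clk_get_rate(info->clk);

		if ((val == CPUFREQ_POSTCHANGE && delta_f > 0) ||
		    (val == CPUFREQ_PRECHANGE && delta_f < 0)) {
			info->clk_rate = clk_get_rate(info->clk);
			/* ... re-run the pixel clock divider calculation ... */
		}
		return 0;
	}

	/* registered once at probe time */
	info->freq_transition.notifier_call = s3c2410fb_cpufreq_transition;
	cpufreq_register_notifier(&info->freq_transition,
				  CPUFREQ_TRANSITION_NOTIFIER);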
Signed-off-by: Simtec Linux Team Signed-off-by: Ben Dooks Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/s3c2410fb.c b/drivers/video/s3c2410fb.c index b0b4513..7da0027 100644 --- a/drivers/video/s3c2410fb.c +++ b/drivers/video/s3c2410fb.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -89,7 +90,7 @@ static void s3c2410fb_set_lcdaddr(struct fb_info *info) static unsigned int s3c2410fb_calc_pixclk(struct s3c2410fb_info *fbi, unsigned long pixclk) { - unsigned long clk = clk_get_rate(fbi->clk); + unsigned long clk = fbi->clk_rate; unsigned long long div; /* pixclk is in picoseconds, our clock is in Hz @@ -758,6 +759,57 @@ static irqreturn_t s3c2410fb_irq(int irq, void *dev_id) return IRQ_HANDLED; } +#ifdef CONFIG_CPU_FREQ + +static int s3c2410fb_cpufreq_transition(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freqs = data; + struct s3c2410fb_info *info; + struct fb_info *fbinfo; + long delta_f; + + info = container_of(nb, struct s3c2410fb_info, freq_transition); + fbinfo = platform_get_drvdata(to_platform_device(info->dev)); + + /* work out change, <0 for speed-up */ + delta_f = info->clk_rate - clk_get_rate(info->clk); + + if ((val == CPUFREQ_POSTCHANGE && delta_f > 0) || + (val == CPUFREQ_PRECHANGE && delta_f < 0)) { + info->clk_rate = clk_get_rate(info->clk); + s3c2410fb_activate_var(fbinfo); + } + + return 0; +} + +static inline int s3c2410fb_cpufreq_register(struct s3c2410fb_info *info) +{ + info->freq_transition.notifier_call = s3c2410fb_cpufreq_transition; + + return cpufreq_register_notifier(&info->freq_transition, + CPUFREQ_TRANSITION_NOTIFIER); +} + +static inline void s3c2410fb_cpufreq_deregister(struct s3c2410fb_info *info) +{ + cpufreq_unregister_notifier(&info->freq_transition, + CPUFREQ_TRANSITION_NOTIFIER); +} + +#else +static inline int s3c2410fb_cpufreq_register(struct s3c2410fb_info *info) +{ + return 0; +} + +static inline void s3c2410fb_cpufreq_deregister(struct s3c2410fb_info *info) +{ +} +#endif + + static char driver_name[] = "s3c2410fb"; static int __init s3c24xxfb_probe(struct platform_device *pdev, @@ -875,6 +927,8 @@ static int __init s3c24xxfb_probe(struct platform_device *pdev, msleep(1); + info->clk_rate = clk_get_rate(info->clk); + /* find maximum required memory size for display */ for (i = 0; i < mach_info->num_displays; i++) { unsigned long smem_len = mach_info->displays[i].xres; @@ -904,11 +958,17 @@ static int __init s3c24xxfb_probe(struct platform_device *pdev, s3c2410fb_check_var(&fbinfo->var, fbinfo); + ret = s3c2410fb_cpufreq_register(info); + if (ret < 0) { + dev_err(&pdev->dev, "Failed to register cpufreq\n"); + goto free_video_memory; + } + ret = register_framebuffer(fbinfo); if (ret < 0) { printk(KERN_ERR "Failed to register framebuffer device: %d\n", ret); - goto free_video_memory; + goto free_cpufreq; } /* create device files */ @@ -922,6 +982,8 @@ static int __init s3c24xxfb_probe(struct platform_device *pdev, return 0; + free_cpufreq: + s3c2410fb_cpufreq_deregister(info); free_video_memory: s3c2410fb_unmap_video_memory(fbinfo); release_clock: @@ -961,6 +1023,7 @@ static int s3c2410fb_remove(struct platform_device *pdev) int irq; unregister_framebuffer(fbinfo); + s3c2410fb_cpufreq_deregister(info); s3c2410fb_lcd_enable(info, 0); msleep(1); diff --git a/drivers/video/s3c2410fb.h b/drivers/video/s3c2410fb.h index 9a6ba3e..47a17bd 100644 --- a/drivers/video/s3c2410fb.h +++ b/drivers/video/s3c2410fb.h @@ -29,8 +29,13 @@ struct 
s3c2410fb_info { enum s3c_drv_type drv_type; struct s3c2410fb_hw regs; + unsigned long clk_rate; unsigned int palette_ready; +#ifdef CONFIG_CPU_FREQ + struct notifier_block freq_transition; +#endif + /* keep these registers in case we need to re-write palette */ u32 palette_buffer[256]; u32 pseudo_pal[16]; -- cgit v0.10.2 From f73323de5a07e2a7bf3e9bca36dcc8057e5446d4 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 16 Jun 2009 15:34:35 -0700 Subject: radeon: P2G2CLK_ALWAYS_ONb tested twice, should 2nd be P2G2CLK_DAC_ALWAYS_ONb? P2G2CLK_ALWAYS_ONb is tested twice; the second test should be of P2G2CLK_DAC_ALWAYS_ONb. [akpm@linux-foundation.org: remove duplicated bitwise-OR of PIXCLKS_CNTL__R300_P2G2CLK_ALWAYS_ONb too] Signed-off-by: Roel Kluin Acked-by: Benjamin Herrenschmidt Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/aty/radeon_pm.c b/drivers/video/aty/radeon_pm.c index 97a1f09..515cf19 100644 --- a/drivers/video/aty/radeon_pm.c +++ b/drivers/video/aty/radeon_pm.c @@ -213,7 +213,6 @@ static void radeon_pm_disable_dynamic_mode(struct radeonfb_info *rinfo) PIXCLKS_CNTL__R300_PIXCLK_TRANS_ALWAYS_ONb | PIXCLKS_CNTL__R300_PIXCLK_TVO_ALWAYS_ONb | PIXCLKS_CNTL__R300_P2G2CLK_ALWAYS_ONb | - PIXCLKS_CNTL__R300_P2G2CLK_ALWAYS_ONb | PIXCLKS_CNTL__R300_DISP_DAC_PIXCLK_DAC2_BLANK_OFF); OUTPLL(pllPIXCLKS_CNTL, tmp); @@ -395,7 +394,7 @@ static void radeon_pm_enable_dynamic_mode(struct radeonfb_info *rinfo) PIXCLKS_CNTL__R300_PIXCLK_TRANS_ALWAYS_ONb | PIXCLKS_CNTL__R300_PIXCLK_TVO_ALWAYS_ONb | PIXCLKS_CNTL__R300_P2G2CLK_ALWAYS_ONb | - PIXCLKS_CNTL__R300_P2G2CLK_ALWAYS_ONb); + PIXCLKS_CNTL__R300_P2G2CLK_DAC_ALWAYS_ONb); OUTPLL(pllPIXCLKS_CNTL, tmp); tmp = INPLL(pllMCLK_MISC); -- cgit v0.10.2 From 491bcc9bf5d9a57f2d9cb3ce8ba0f6d48752c113 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 16 Jun 2009 15:34:36 -0700 Subject: fbdev: use framebuffer_release() for freeing fb_info structures Use framebuffer_release() for freeing fb_info structures allocated with framebuffer_alloc().
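The rule behind these conversions: an fb_info obtained from framebuffer_alloc() carries extra setup (the driver-private par area is allocated in the same block and the embedded device fields are initialised), so it must be returned through framebuffer_release() rather than a bare kfree(). A minimal sketch of the pairing (struct drv_par is a stand-in name):

	info = framebuffer_alloc(sizeof(struct drv_par), &pdev->dev);
	if (!info)
		return -ENOMEM;
	par = info->par;		/* private data allocated alongside */
	/* ... */
	framebuffer_release(info);	/* counterpart of framebuffer_alloc() */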
Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/offb.c b/drivers/video/offb.c index bb915a4..4d8c54c 100644 --- a/drivers/video/offb.c +++ b/drivers/video/offb.c @@ -493,7 +493,7 @@ static void __init offb_init_fb(const char *name, const char *full_name, iounmap(par->cmap_adr); par->cmap_adr = NULL; iounmap(info->screen_base); - kfree(info); + framebuffer_release(info); release_mem_region(res_start, res_size); return; } diff --git a/drivers/video/pm2fb.c b/drivers/video/pm2fb.c index c6dd924..36436ee 100644 --- a/drivers/video/pm2fb.c +++ b/drivers/video/pm2fb.c @@ -1748,7 +1748,7 @@ static void __devexit pm2fb_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); fb_dealloc_cmap(&info->cmap); kfree(info->pixmap.addr); - kfree(info); + framebuffer_release(info); } static struct pci_device_id pm2fb_id_table[] = { diff --git a/drivers/video/sis/sis_main.c b/drivers/video/sis/sis_main.c index 7e17ee9..7072d19 100644 --- a/drivers/video/sis/sis_main.c +++ b/drivers/video/sis/sis_main.c @@ -5928,7 +5928,7 @@ sisfb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if(pci_enable_device(pdev)) { if(ivideo->nbridge) pci_dev_put(ivideo->nbridge); pci_set_drvdata(pdev, NULL); - kfree(sis_fb_info); + framebuffer_release(sis_fb_info); return -EIO; } } @@ -6134,7 +6134,7 @@ error_3: vfree(ivideo->bios_abase); pci_set_drvdata(pdev, NULL); if(!ivideo->sisvga_enabled) pci_disable_device(pdev); - kfree(sis_fb_info); + framebuffer_release(sis_fb_info); return ret; } diff --git a/drivers/video/stifb.c b/drivers/video/stifb.c index eabaad7..eec9dcb 100644 --- a/drivers/video/stifb.c +++ b/drivers/video/stifb.c @@ -1380,7 +1380,7 @@ stifb_cleanup(void) if (info->screen_base) iounmap(info->screen_base); fb_dealloc_cmap(&info->cmap); - kfree(info); + framebuffer_release(info); } sti->info = NULL; } -- cgit v0.10.2 From b586640141ab5f4ab3b194419bc2c0f039e91dbc Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 16 Jun 2009 15:34:37 -0700 Subject: intelfb: fix a bug when changing video timing When changing video timing dynamically via fbset, the screen is sporadically rendered black. With the attached fix, which disables the VCO prior to the timing register change, the problem disappears. I had a look at the Xserver register setup code. Here the VCO is disabled in the same way [1]. This patch is taken from vga-sync-field version 0.0.11 [2][3]. [1] http://cgit.freedesktop.org/xorg/driver/xf86-video-intel/tree/src/i830_driver.c [2] http://lowbyte.de/vga-sync-fields/vga-sync-fields-0.0.11.tgz [3] http://easy-vdr.de/git?p=frc.git/.git;a=commit;h=dcc3b863e5a663652587619c357bd20075af6896 Signed-off-by: Thomas Hilber Signed-off-by: Paul Menzel Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/intelfb/intelfbdrv.c b/drivers/video/intelfb/intelfbdrv.c index ace14fe..0cafd64 100644 --- a/drivers/video/intelfb/intelfbdrv.c +++ b/drivers/video/intelfb/intelfbdrv.c @@ -1365,6 +1365,11 @@ static int intelfb_set_par(struct fb_info *info) DBG_MSG("intelfb_set_par (%dx%d-%d)\n", info->var.xres, info->var.yres, info->var.bits_per_pixel); + /* + * Disable VCO prior to timing register change.
+ */ + OUTREG(DPLL_A, INREG(DPLL_A) & ~DPLL_VCO_ENABLE); + intelfb_blank(FB_BLANK_POWERDOWN, info); if (ACCEL(dinfo, info)) -- cgit v0.10.2 From 4410f3910947dcea8672280b3adecd53cec4e85e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 16 Jun 2009 15:34:38 -0700 Subject: fbdev: add support for handoff from firmware to hw framebuffers With KMS we have run into an issue where we really want the KMS fb driver to be the one running the console, so panics etc can be shown by switching out of X etc. However with vesafb/efifb built-in, we end up with those on fb0 and the KMS fb driver on fb1, driving the same piece of hw, so this adds an fb info flag to denote a firmware fbdev, and adds a new aperture base/size range which can be compared when the hw drivers are installed to see if there is a conflict with a firmware driver, and if there is, the firmware driver is unregistered and the hw driver takes over. It uses new aperture_base/size members instead of comparing on the fix smem_start/length, as smem_start/length might for example only cover the first 1MB of the PCI aperture, and we could allocate the kms fb from 8MB into the aperture, thus they would never overlap. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Airlie Acked-by: Peter Jones Cc: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/gpu/drm/i915/intel_fb.c b/drivers/gpu/drm/i915/intel_fb.c index 0ecf6b7..8e28e59 100644 --- a/drivers/gpu/drm/i915/intel_fb.c +++ b/drivers/gpu/drm/i915/intel_fb.c @@ -504,6 +504,14 @@ static int intelfb_create(struct drm_device *dev, uint32_t fb_width, info->fbops = &intelfb_ops; info->fix.line_length = fb->pitch; + + /* setup aperture base/size for vesafb takeover */ + info->aperture_base = dev->mode_config.fb_base; + if (IS_I9XX(dev)) + info->aperture_size = pci_resource_len(dev->pdev, 2); + else + info->aperture_size = pci_resource_len(dev->pdev, 0); + info->fix.smem_start = dev->mode_config.fb_base + obj_priv->gtt_offset; info->fix.smem_len = size; diff --git a/drivers/video/efifb.c b/drivers/video/efifb.c index 8dea2bc..eb12182 100644 --- a/drivers/video/efifb.c +++ b/drivers/video/efifb.c @@ -280,6 +280,9 @@ static int __init efifb_probe(struct platform_device *dev) info->pseudo_palette = info->par; info->par = NULL; + info->aperture_base = efifb_fix.smem_start; + info->aperture_size = size_total; + info->screen_base = ioremap(efifb_fix.smem_start, efifb_fix.smem_len); if (!info->screen_base) { printk(KERN_ERR "efifb: abort, cannot ioremap video memory " @@ -337,7 +340,7 @@ static int __init efifb_probe(struct platform_device *dev) info->fbops = &efifb_ops; info->var = efifb_defined; info->fix = efifb_fix; - info->flags = FBINFO_FLAG_DEFAULT; + info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE; if ((err = fb_alloc_cmap(&info->cmap, 256, 0)) < 0) { printk(KERN_ERR "efifb: cannot allocate colormap\n"); diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index d412a1d..f8a09bf 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1462,6 +1462,16 @@ static int fb_check_foreignness(struct fb_info *fi) return 0; } +static bool fb_do_apertures_overlap(struct fb_info *gen, struct fb_info *hw) +{ + /* is the generic aperture base the same as the HW one */ + if (gen->aperture_base == hw->aperture_base) + return true; + /* is the generic aperture base inside the hw base->hw base+size */ + if (gen->aperture_base > hw->aperture_base && gen->aperture_base <= hw->aperture_base +
hw->aperture_size) + return true; + return false; +} /** * register_framebuffer - registers a frame buffer device * @fb_info: frame buffer info structure @@ -1485,6 +1495,23 @@ register_framebuffer(struct fb_info *fb_info) if (fb_check_foreignness(fb_info)) return -ENOSYS; + /* check all firmware fbs and kick off if the base addr overlaps */ + for (i = 0 ; i < FB_MAX; i++) { + if (!registered_fb[i]) + continue; + + if (registered_fb[i]->flags & FBINFO_MISC_FIRMWARE) { + if (fb_do_apertures_overlap(registered_fb[i], fb_info)) { + printk(KERN_ERR "fb: conflicting fb hw usage " + "%s vs %s - removing generic driver\n", + fb_info->fix.id, + registered_fb[i]->fix.id); + unregister_framebuffer(registered_fb[i]); + break; + } + } + } + num_registered_fb++; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) @@ -1586,6 +1613,10 @@ unregister_framebuffer(struct fb_info *fb_info) device_destroy(fb_class, MKDEV(FB_MAJOR, i)); event.info = fb_info; fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event); + + /* this may free fb info */ + if (fb_info->fbops->fb_destroy) + fb_info->fbops->fb_destroy(fb_info); done: return ret; } diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index d6856f4..bd37ee1 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -174,8 +174,17 @@ static int vesafb_setcolreg(unsigned regno, unsigned red, unsigned green, return err; } +static void vesafb_destroy(struct fb_info *info) +{ + if (info->screen_base) + iounmap(info->screen_base); + release_mem_region(info->aperture_base, info->aperture_size); + framebuffer_release(info); +} + static struct fb_ops vesafb_ops = { .owner = THIS_MODULE, + .fb_destroy = vesafb_destroy, .fb_setcolreg = vesafb_setcolreg, .fb_pan_display = vesafb_pan_display, .fb_fillrect = cfb_fillrect, @@ -286,6 +295,10 @@ static int __init vesafb_probe(struct platform_device *dev) info->pseudo_palette = info->par; info->par = NULL; + /* set vesafb aperture size for generic probing */ + info->aperture_base = screen_info.lfb_base; + info->aperture_size = size_total; + info->screen_base = ioremap(vesafb_fix.smem_start, vesafb_fix.smem_len); if (!info->screen_base) { printk(KERN_ERR @@ -437,7 +450,7 @@ static int __init vesafb_probe(struct platform_device *dev) info->fbops = &vesafb_ops; info->var = vesafb_defined; info->fix = vesafb_fix; - info->flags = FBINFO_FLAG_DEFAULT | + info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE | (ypan ? FBINFO_HWACCEL_YPAN : 0); if (!ypan) diff --git a/include/linux/fb.h b/include/linux/fb.h index 330c4b1..3c5562a 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -677,6 +677,9 @@ struct fb_ops { /* get capability given var */ void (*fb_get_caps)(struct fb_info *info, struct fb_blit_caps *caps, struct fb_var_screeninfo *var); + + /* teardown any resources to do with this framebuffer */ + void (*fb_destroy)(struct fb_info *info); }; #ifdef CONFIG_FB_TILEBLITTING @@ -786,6 +789,8 @@ struct fb_tile_ops { #define FBINFO_MISC_USEREVENT 0x10000 /* event request from userspace */ #define FBINFO_MISC_TILEBLITTING 0x20000 /* use tile blitting */ +#define FBINFO_MISC_FIRMWARE 0x40000 /* a replaceable firmware + inited framebuffer */ /* A driver may set this flag to indicate that it does want a set_par to be * called every time when fbcon_switch is executed. 
The advantage is that with @@ -854,7 +859,12 @@ struct fb_info { u32 state; /* Hardware state i.e suspend */ void *fbcon_par; /* fbcon use-only private area */ /* From here on everything is device dependent */ - void *par; + void *par; + /* we need the PCI or similiar aperture base/size not + smem_start/size as smem_start may just be an object + allocated inside the aperture so may not actually overlap */ + resource_size_t aperture_base; + resource_size_t aperture_size; }; #ifdef MODULE -- cgit v0.10.2 From fe3a1aa239a74bcbf25211aab33b6ecc80acf0f9 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 16 Jun 2009 15:34:39 -0700 Subject: tcx: use standard fields for framebuffer physical address and length Use standard fields fbinfo.fix.smem_start and fbinfo.fix.smem_len for physical address and length of framebuffer. This also fixes output of the 'fbset -i' command - address and length of the framebuffer are displayed correctly. Signed-off-by: Krzysztof Helt Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c index 643afbfe..45b8835 100644 --- a/drivers/video/tcx.c +++ b/drivers/video/tcx.c @@ -116,17 +116,16 @@ struct tcx_par { u32 flags; #define TCX_FLAG_BLANKED 0x00000001 - unsigned long physbase; unsigned long which_io; - unsigned long fbsize; struct sbus_mmap_map mmap_map[TCX_MMAP_ENTRIES]; int lowdepth; }; /* Reset control plane so that WID is 8-bit plane. */ -static void __tcx_set_control_plane(struct tcx_par *par) +static void __tcx_set_control_plane(struct fb_info *info) { + struct tcx_par *par = info->par; u32 __iomem *p, *pend; if (par->lowdepth) @@ -135,7 +134,7 @@ static void __tcx_set_control_plane(struct tcx_par *par) p = par->cplane; if (p == NULL) return; - for (pend = p + par->fbsize; p < pend; p++) { + for (pend = p + info->fix.smem_len; p < pend; p++) { u32 tmp = sbus_readl(p); tmp &= 0xffffff; @@ -149,7 +148,7 @@ static void tcx_reset(struct fb_info *info) unsigned long flags; spin_lock_irqsave(&par->lock, flags); - __tcx_set_control_plane(par); + __tcx_set_control_plane(info); spin_unlock_irqrestore(&par->lock, flags); } @@ -304,7 +303,7 @@ static int tcx_mmap(struct fb_info *info, struct vm_area_struct *vma) struct tcx_par *par = (struct tcx_par *)info->par; return sbusfb_mmap_helper(par->mmap_map, - par->physbase, par->fbsize, + info->fix.smem_start, info->fix.smem_len, par->which_io, vma); } @@ -316,7 +315,7 @@ static int tcx_ioctl(struct fb_info *info, unsigned int cmd, return sbusfb_ioctl_helper(cmd, arg, info, FBTYPE_TCXCOLOR, (par->lowdepth ? 
8 : 24), - par->fbsize); + info->fix.smem_len); } /* @@ -358,10 +357,10 @@ static void tcx_unmap_regs(struct of_device *op, struct fb_info *info, par->bt, sizeof(struct bt_regs)); if (par->cplane) of_iounmap(&op->resource[4], - par->cplane, par->fbsize * sizeof(u32)); + par->cplane, info->fix.smem_len * sizeof(u32)); if (info->screen_base) of_iounmap(&op->resource[0], - info->screen_base, par->fbsize); + info->screen_base, info->fix.smem_len); } static int __devinit tcx_probe(struct of_device *op, @@ -391,7 +390,7 @@ static int __devinit tcx_probe(struct of_device *op, linebytes = of_getintprop_default(dp, "linebytes", info->var.xres); - par->fbsize = PAGE_ALIGN(linebytes * info->var.yres); + info->fix.smem_len = PAGE_ALIGN(linebytes * info->var.yres); par->tec = of_ioremap(&op->resource[7], 0, sizeof(struct tcx_tec), "tcx tec"); @@ -400,7 +399,7 @@ static int __devinit tcx_probe(struct of_device *op, par->bt = of_ioremap(&op->resource[8], 0, sizeof(struct bt_regs), "tcx dac"); info->screen_base = of_ioremap(&op->resource[0], 0, - par->fbsize, "tcx ram"); + info->fix.smem_len, "tcx ram"); if (!par->tec || !par->thc || !par->bt || !info->screen_base) goto out_unmap_regs; @@ -408,7 +407,7 @@ static int __devinit tcx_probe(struct of_device *op, memcpy(&par->mmap_map, &__tcx_mmap_map, sizeof(par->mmap_map)); if (!par->lowdepth) { par->cplane = of_ioremap(&op->resource[4], 0, - par->fbsize * sizeof(u32), + info->fix.smem_len * sizeof(u32), "tcx cplane"); if (!par->cplane) goto out_unmap_regs; @@ -419,7 +418,7 @@ static int __devinit tcx_probe(struct of_device *op, par->mmap_map[6].size = SBUS_MMAP_EMPTY; } - par->physbase = op->resource[0].start; + info->fix.smem_start = op->resource[0].start; par->which_io = op->resource[0].flags & IORESOURCE_BITS; for (i = 0; i < TCX_MMAP_ENTRIES; i++) { @@ -473,7 +472,7 @@ static int __devinit tcx_probe(struct of_device *op, printk(KERN_INFO "%s: TCX at %lx:%lx, %s\n", dp->full_name, par->which_io, - par->physbase, + info->fix.smem_start, par->lowdepth ? "8-bit only" : "24-bit depth"); return 0; -- cgit v0.10.2 From 3ed167af96ed098187ea41353fe02d1af20d38a1 Mon Sep 17 00:00:00 2001 From: Kristoffer Ericson Date: Tue, 16 Jun 2009 15:34:40 -0700 Subject: fbdev: s1d13xxxfb: add accelerated bitblt functions Add accelerated bitblt functions to s1d13xxx based video chipsets, more specifically the copyarea and fillrect functions. It has only been tested and activated for 13506 chipsets but is expected to work for the majority of s1d13xxx based chips. This patch also cleans up the driver with respect to whitespace and other formatting issues. We update the current status comments. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kristoffer Ericson Cc: Russell King Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/s1d13xxxfb.c b/drivers/video/s1d13xxxfb.c index 0726aec..0deb0a8 100644 --- a/drivers/video/s1d13xxxfb.c +++ b/drivers/video/s1d13xxxfb.c @@ -2,6 +2,7 @@ * * (c) 2004 Simtec Electronics * (c) 2005 Thibaut VARENE + * (c) 2009 Kristoffer Ericson * * Driver for Epson S1D13xxx series framebuffer chips * @@ -10,18 +11,10 @@ * linux/drivers/video/epson1355fb.c * linux/drivers/video/epson/s1d13xxxfb.c (2.4 driver by Epson) * - * Note, currently only tested on S1D13806 with 16bit CRT. - * As such, this driver might still contain some hardcoded bits relating to - * S1D13806.
- * Making it work on other S1D13XXX chips should merely be a matter of adding - * a few switch()s, some missing glue here and there maybe, and split header - * files. - * * TODO: - handle dual screen display (CRT and LCD at the same time). * - check_var(), mode change, etc. - * - PM untested. - * - Accelerated interfaces. - * - Probably not SMP safe :) + * - probably not SMP safe :) + * - support all bitblt operations on all cards * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for @@ -31,19 +24,24 @@ #include #include #include - #include #include #include #include #include +#include +#include #include #include