From 543585cc5b07fa99a2dc897159fbf48c1eb73058 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 18 Oct 2011 22:09:24 -0700
Subject: slab: rename slab_break_gfp_order to slab_max_order

slab_break_gfp_order is more appropriately named slab_max_order since it
enforces the maximum order size of slabs as long as a single object will
still fit.

Also rename BREAK_GFP_ORDER_{LO,HI} accordingly.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg

diff --git a/mm/slab.c b/mm/slab.c
index 708efe8..1a482e8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -481,9 +481,9 @@ EXPORT_SYMBOL(slab_buffer_size);
 /*
  * Do not go above this order unless 0 objects fit into the slab.
  */
-#define BREAK_GFP_ORDER_HI	1
-#define BREAK_GFP_ORDER_LO	0
-static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+#define SLAB_MAX_ORDER_HI	1
+#define SLAB_MAX_ORDER_LO	0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
 
 /*
  * Functions for storing/retrieving the cachep and or slab from the page
@@ -1502,7 +1502,7 @@ void __init kmem_cache_init(void)
	 * page orders on machines with more than 32MB of memory.
	 */
	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+		slab_max_order = SLAB_MAX_ORDER_HI;
 
	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
@@ -2112,7 +2112,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
	 * Large number of objects is good, but very large slabs are
	 * currently bad for the gfp()s.
	 */
-	if (gfporder >= slab_break_gfp_order)
+	if (gfporder >= slab_max_order)
		break;
 
	/*

-- cgit v0.10.2

From 3df1cccdfb3fab6aa9176beb655d802eb384eabc Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 18 Oct 2011 22:09:28 -0700
Subject: slab: introduce slab_max_order kernel parameter

Introduce a new slab_max_order kernel parameter, the equivalent of
slub_max_order.

For immediate purposes, it allows users to override the heuristic that
sets the max order to 1 by default if they have more than 32MB of RAM.
This may result in page allocation failures if there is substantial
fragmentation.

Another use case would be to increase the max order for better
performance.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a0c5c5f..b21093e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2362,6 +2362,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
	slram=		[HW,MTD]
 
+	slab_max_order=	[MM, SLAB]
+			Determines the maximum allowed order for slabs.
+			A high setting may cause OOMs due to memory
+			fragmentation. Defaults to 1 for systems with
+			more than 32MB of RAM, 0 otherwise.
+
	slub_debug[=options[,slabs]]	[MM, SLUB]
			Enabling slub_debug allows one to determine the
			culprit if slab objects become corrupted. Enabling
diff --git a/mm/slab.c b/mm/slab.c
index 1a482e8..b0414d1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -479,11 +479,13 @@ EXPORT_SYMBOL(slab_buffer_size);
 #endif
 
 /*
- * Do not go above this order unless 0 objects fit into the slab.
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
  */
 #define SLAB_MAX_ORDER_HI	1
 #define SLAB_MAX_ORDER_LO	0
 static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
 
 /*
  * Functions for storing/retrieving the cachep and or slab from the page
@@ -851,6 +853,17 @@ static int __init noaliencache_setup(char *s)
 }
 __setup("noaliencache", noaliencache_setup);
 
+static int __init slab_max_order_setup(char *str)
+{
+	get_option(&str, &slab_max_order);
+	slab_max_order = slab_max_order < 0 ? 0 :
+				min(slab_max_order, MAX_ORDER - 1);
+	slab_max_order_set = true;
+
+	return 1;
+}
+__setup("slab_max_order=", slab_max_order_setup);
+
 #ifdef CONFIG_NUMA
 /*
  * Special reaping functions for NUMA systems called from cache_reap().
@@ -1499,9 +1512,10 @@ void __init kmem_cache_init(void)
 
	/*
	 * Fragmentation resistance on low memory - only use bigger
-	 * page orders on machines with more than 32MB of memory.
+	 * page orders on machines with more than 32MB of memory if
+	 * not overridden on the command line.
	 */
-	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
+	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
		slab_max_order = SLAB_MAX_ORDER_HI;
 
	/* Bootstrap is tricky, because several objects are allocated

-- cgit v0.10.2
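
A brief usage illustration (not part of the patches above): with this parameter in place, booting with a command line containing

    slab_max_order=0

keeps SLAB at order-0 slabs even on machines with more than 32MB of RAM. slab_max_order_setup() parses the value with get_option(), clamps it to the range [0, MAX_ORDER - 1], and sets slab_max_order_set so that kmem_cache_init() no longer raises the default to SLAB_MAX_ORDER_HI.
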
From 265d47e7115023df9e2b7a864b207b4738d9e18c Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Tue, 15 Nov 2011 15:04:00 -0800
Subject: slub: add taint flag outputting to debug paths

When we get corruption reports, it's useful to see if the kernel was
tainted, to rule out problems we can't do anything about.

Signed-off-by: Dave Jones
Signed-off-by: Andrew Morton
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 7d2a996..60552d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -570,7 +570,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
	va_end(args);
	printk(KERN_ERR "========================================"
			"=====================================\n");
-	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+	printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
	printk(KERN_ERR "----------------------------------------"
			"-------------------------------------\n\n");
 }

-- cgit v0.10.2

From face37f5e615646f364fa848f0a5c9d361d7a46e Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Tue, 15 Nov 2011 15:03:52 -0800
Subject: slab: add taint flag outputting to debug paths.

When we get corruption reports, it's useful to see if the kernel was
tainted, to rule out problems we can't do anything about.

Signed-off-by: Dave Jones
Signed-off-by: Andrew Morton
Signed-off-by: Pekka Enberg

diff --git a/mm/slab.c b/mm/slab.c
index b0414d1..a7f9c24 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1941,8 +1941,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
			/* Print header */
			if (lines == 0) {
				printk(KERN_ERR
-					"Slab corruption: %s start=%p, len=%d\n",
-					cachep->name, realobj, size);
+					"Slab corruption (%s): %s start=%p, len=%d\n",
+					print_tainted(), cachep->name, realobj, size);
				print_objinfo(cachep, objp, 0);
			}
			/* Hexdump the affected line */
@@ -3051,8 +3051,9 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
	if (entries != cachep->num - slabp->inuse) {
 bad:
		printk(KERN_ERR "slab: Internal list corruption detected in "
-				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-				cachep->name, cachep->num, slabp, slabp->inuse);
+				"cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+				cachep->name, cachep->num, slabp, slabp->inuse,
+				print_tainted());
		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
			sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
			1);

-- cgit v0.10.2

From 4c493a5a5c0bab6c434af2723328edd79c49aa0c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 11 Nov 2011 14:54:14 +0800
Subject: slub: add missed accounting

With the per-cpu partial list, a slab is added to the partial list first
and then moved to the node list. The __slab_free() code path for
add/remove_partial is almost deprecated (except for slub debug), but we
forgot to account add/remove_partial when moving per-cpu partial pages to
the node list, so the statistics for such events are always 0. Add the
corresponding accounting.

This is against the patch "slub: use correct parameter to add a page to
partial list tail".

Acked-by: Christoph Lameter
Signed-off-by: Shaohua Li
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index c313823..108ed03 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s)
			}
 
			if (l != m) {
-				if (l == M_PARTIAL)
+				if (l == M_PARTIAL) {
					remove_partial(n, page);
-				else
+					stat(s, FREE_REMOVE_PARTIAL);
+				} else {
					add_partial(n, page,
						DEACTIVATE_TO_TAIL);
+					stat(s, FREE_ADD_PARTIAL);
+				}
 
				l = m;
			}

-- cgit v0.10.2
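
A note on where the new counts show up (an assumption about the surrounding SLUB code, not something stated in the changelog): the FREE_ADD_PARTIAL and FREE_REMOVE_PARTIAL events are only collected on kernels built with CONFIG_SLUB_STATS, where they are exported as per-cache sysfs files, e.g.

    cat /sys/kernel/slab/dentry/free_add_partial

Before this fix, pages moved between the per-cpu partial list and the node list by unfreeze_partials() never incremented those counters; afterwards they do.
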
From 25f4379b8c79066c4be0e5995f37f5265733b801 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 22 Nov 2011 06:59:09 +0100
Subject: slub: fix slub_max_order Documentation

slub_max_order default is 3 (aka PAGE_ALLOC_COSTLY_ORDER), not 1.

Acked-by: David Rientjes
Acked-by: Christoph Lameter
Signed-off-by: Eric Dumazet
Signed-off-by: Pekka Enberg

diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index f464f47..2acdda9 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -117,7 +117,7 @@ can be influenced by kernel parameters:
 
 slub_min_objects=x		(default 4)
 slub_min_order=x		(default 0)
-slub_max_order=x		(default 1)
+slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
 
 slub_min_objects allows to specify how many objects must at least fit
 into one slab in order for the allocation order to be acceptable.

-- cgit v0.10.2

From 73736e0387ba0e6d2b703407b4d26168d31516a7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 13 Dec 2011 04:57:06 +0100
Subject: slub: fix a possible memleak in __slab_alloc()

Zhihua Che reported a possible memleak in the slub allocator on
CONFIG_PREEMPT=y builds.

It is possible the current thread migrates right before disabling irqs
in __slab_alloc(). We must check c->freelist again, and perform a normal
allocation instead of scratching c->freelist.

Many thanks to Zhihua Che for spotting this bug, introduced in 2.6.39.

V2: It's also possible an IRQ freed one (or several) object(s) and
populated c->freelist, so it's not a CONFIG_PREEMPT-only problem.

Cc: [2.6.39+]
Reported-by: Zhihua Che
Signed-off-by: Eric Dumazet
Acked-by: Christoph Lameter
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 108ed03..5e410a9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2169,6 +2169,11 @@ redo:
		goto new_slab;
	}
 
+	/* must check again c->freelist in case of cpu migration or IRQ */
+	object = c->freelist;
+	if (object)
+		goto load_freelist;
+
	stat(s, ALLOC_SLOWPATH);
 
	do {

-- cgit v0.10.2
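
An illustrative interleaving of the race being closed here (reconstructed from the changelog, not quoted from it):

    1. The fast path finds c->freelist empty and enters __slab_alloc().
    2. Before local_irq_save(), an IRQ frees objects into c->freelist,
       or the task migrates to a CPU whose c->freelist is populated.
    3. The old slow path ignored c->freelist and refilled it from the
       page freelist, so the objects already sitting in c->freelist
       were overwritten and leaked.

The added recheck consumes those objects via load_freelist instead.
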
From 8f1e33daeda6cd89753f9e77d174805a6f21db09 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Wed, 23 Nov 2011 09:24:27 -0600
Subject: slub: Switch per cpu partial page support off for debugging

Eric saw an issue with accounting of slabs during validation.

It's not possible to accurately determine how many per-cpu partial slabs
exist at any time, so this switches off per-cpu partial pages during
debug.

Acked-by: Eric Dumazet
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index ed3334d..4056d29 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3028,7 +3028,9 @@ static int kmem_cache_open(struct kmem_cache *s,
	 * per node list when we run out of per cpu objects. We only fetch 50%
	 * to keep some capacity around for frees.
	 */
-	if (s->size >= PAGE_SIZE)
+	if (kmem_cache_debug(s))
+		s->cpu_partial = 0;
+	else if (s->size >= PAGE_SIZE)
		s->cpu_partial = 2;
	else if (s->size >= 1024)
		s->cpu_partial = 6;

-- cgit v0.10.2

From 213eeb9fd9d66c33109e2ace242df214dc3a653d Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 11 Nov 2011 14:07:14 -0600
Subject: slub: Extract get_freelist from __slab_alloc

get_freelist retrieves free objects from the page freelist (put there by
remote frees) or deactivates a slab page if no more objects are
available.

Acked-by: David Rientjes
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 5e410a9..6dc79f8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2127,6 +2127,37 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
 }
 
 /*
+ * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
+ * or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+	struct page new;
+	unsigned long counters;
+	void *freelist;
+
+	do {
+		freelist = page->freelist;
+		counters = page->counters;
+		new.counters = counters;
+		VM_BUG_ON(!new.frozen);
+
+		new.inuse = page->objects;
+		new.frozen = freelist != NULL;
+
+	} while (!cmpxchg_double_slab(s, page,
+		freelist, counters,
+		NULL, new.counters,
+		"get_freelist"));
+
+	return freelist;
+}
+
+/*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
  *
@@ -2147,8 +2178,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
	void **object;
	unsigned long flags;
-	struct page new;
-	unsigned long counters;
 
	local_irq_save(flags);
 #ifdef CONFIG_PREEMPT
@@ -2176,29 +2205,7 @@ redo:
 
	stat(s, ALLOC_SLOWPATH);
 
-	do {
-		object = c->page->freelist;
-		counters = c->page->counters;
-		new.counters = counters;
-		VM_BUG_ON(!new.frozen);
-
-		/*
-		 * If there is no object left then we use this loop to
-		 * deactivate the slab which is simple since no objects
-		 * are left in the slab and therefore we do not need to
-		 * put the page back onto the partial list.
-		 *
-		 * If there are objects left then we retrieve them
-		 * and use them to refill the per cpu queue.
-		 */
-
-		new.inuse = c->page->objects;
-		new.frozen = object != NULL;
-
-	} while (!__cmpxchg_double_slab(s, c->page,
-			object, counters,
-			NULL, new.counters,
-			"__slab_alloc"));
+	object = get_freelist(s, c->page);
 
	if (!object) {
		c->page = NULL;

-- cgit v0.10.2
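
A short reading of the new helper (a summary of the code, not additional changelog text): when page->freelist is empty, new.frozen ends up 0, the cmpxchg unfreezes the page and NULL is returned, so __slab_alloc() drops c->page; otherwise the entire freelist is claimed (new.inuse = page->objects), the page stays frozen, and the returned list refills the per-cpu freelist.
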
From b13683d1cc14d1dd30b8e20f3ebea3f814ad029f Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 11 Nov 2011 14:54:14 +0800
Subject: slub: add missed accounting

With the per-cpu partial list, a slab is added to the partial list first
and then moved to the node list. The __slab_free() code path for
add/remove_partial is almost deprecated (except for slub debug), but we
forgot to account add/remove_partial when moving per-cpu partial pages to
the node list, so the statistics for such events are always 0. Add the
corresponding accounting.

This is against the patch "slub: use correct parameter to add a page to
partial list tail".

Acked-by: Christoph Lameter
Signed-off-by: Shaohua Li
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 4056d29..8284a20 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s)
			}
 
			if (l != m) {
-				if (l == M_PARTIAL)
+				if (l == M_PARTIAL) {
					remove_partial(n, page);
-				else
+					stat(s, FREE_REMOVE_PARTIAL);
+				} else {
					add_partial(n, page,
						DEACTIVATE_TO_TAIL);
+					stat(s, FREE_ADD_PARTIAL);
+				}
 
				l = m;
			}

-- cgit v0.10.2

From 74ee4ef1f901fbb014bdcdc9171d126490ce2b62 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 9 Jan 2012 13:19:45 -0800
Subject: slub: disallow changing cpu_partial from userspace for debug caches

For caches with debugging enabled, "slub: Switch per cpu partial page
support off for debugging" changes cpu_partial to 0. It shouldn't be
tunable from userspace for such caches; otherwise, the same accounting
issues arise during validation.

This patch disallows tuning /sys/kernel/slab/cache/cpu_partial to be
non-zero for caches with debugging enabled.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 6dc79f8..a47df0a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4649,6 +4649,8 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
	err = strict_strtoul(buf, 10, &objects);
	if (err)
		return err;
+	if (objects && kmem_cache_debug(s))
+		return -EINVAL;
 
	s->cpu_partial = objects;
	flush_all(s);

-- cgit v0.10.2
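
A usage sketch of the resulting behavior (assumed from the diff, not quoted from the changelog): for a cache with debugging enabled, for example when booted with slub_debug, a write such as

    echo 8 > /sys/kernel/slab/<cache>/cpu_partial

now fails with EINVAL, while writing 0 is still accepted and matches the cpu_partial = 0 that kmem_cache_open() sets for debug caches.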