From 543585cc5b07fa99a2dc897159fbf48c1eb73058 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 18 Oct 2011 22:09:24 -0700
Subject: slab: rename slab_break_gfp_order to slab_max_order

slab_break_gfp_order is more appropriately named slab_max_order since it
enforces the maximum order size of slabs as long as a single object will
still fit.

Also rename BREAK_GFP_ORDER_{LO,HI} accordingly.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg

diff --git a/mm/slab.c b/mm/slab.c
index 708efe8..1a482e8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -481,9 +481,9 @@ EXPORT_SYMBOL(slab_buffer_size);
 /*
  * Do not go above this order unless 0 objects fit into the slab.
  */
-#define BREAK_GFP_ORDER_HI	1
-#define BREAK_GFP_ORDER_LO	0
-static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+#define SLAB_MAX_ORDER_HI	1
+#define SLAB_MAX_ORDER_LO	0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
 
 /*
  * Functions for storing/retrieving the cachep and or slab from the page
@@ -1502,7 +1502,7 @@ void __init kmem_cache_init(void)
	 * page orders on machines with more than 32MB of memory.
	 */
	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+		slab_max_order = SLAB_MAX_ORDER_HI;
 
	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
@@ -2112,7 +2112,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
	 * Large number of objects is good, but very large slabs are
	 * currently bad for the gfp()s.
	 */
-	if (gfporder >= slab_break_gfp_order)
+	if (gfporder >= slab_max_order)
		break;
 
	/*

-- cgit v0.10.2

From 3df1cccdfb3fab6aa9176beb655d802eb384eabc Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 18 Oct 2011 22:09:28 -0700
Subject: slab: introduce slab_max_order kernel parameter

Introduce a new slab_max_order kernel parameter, the equivalent of
slub_max_order.

For immediate purposes, it allows users to override the heuristic that
sets the max order to 1 by default if they have more than 32MB of RAM.
This may result in page allocation failures if there is substantial
fragmentation.

Another use case would be to increase the max order for better
performance.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a0c5c5f..b21093e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2362,6 +2362,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
	slram=		[HW,MTD]
 
+	slab_max_order=	[MM, SLAB]
+			Determines the maximum allowed order for slabs.
+			A high setting may cause OOMs due to memory
+			fragmentation. Defaults to 1 for systems with
+			more than 32MB of RAM, 0 otherwise.
+
	slub_debug[=options[,slabs]]	[MM, SLUB]
			Enabling slub_debug allows one to determine the
			culprit if slab objects become corrupted. Enabling
diff --git a/mm/slab.c b/mm/slab.c
index 1a482e8..b0414d1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -479,11 +479,13 @@ EXPORT_SYMBOL(slab_buffer_size);
 #endif
 
 /*
- * Do not go above this order unless 0 objects fit into the slab.
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
  */
 #define SLAB_MAX_ORDER_HI	1
 #define SLAB_MAX_ORDER_LO	0
 static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
 
 /*
  * Functions for storing/retrieving the cachep and or slab from the page
@@ -851,6 +853,17 @@ static int __init noaliencache_setup(char *s)
 }
 __setup("noaliencache", noaliencache_setup);
 
+static int __init slab_max_order_setup(char *str)
+{
+	get_option(&str, &slab_max_order);
+	slab_max_order = slab_max_order < 0 ? 0 :
+				min(slab_max_order, MAX_ORDER - 1);
+	slab_max_order_set = true;
+
+	return 1;
+}
+__setup("slab_max_order=", slab_max_order_setup);
+
 #ifdef CONFIG_NUMA
 /*
  * Special reaping functions for NUMA systems called from cache_reap().
@@ -1499,9 +1512,10 @@ void __init kmem_cache_init(void)
 
	/*
	 * Fragmentation resistance on low memory - only use bigger
-	 * page orders on machines with more than 32MB of memory.
+	 * page orders on machines with more than 32MB of memory if
+	 * not overridden on the command line.
	 */
-	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
+	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
		slab_max_order = SLAB_MAX_ORDER_HI;
 
	/* Bootstrap is tricky, because several objects are allocated

-- cgit v0.10.2
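
A brief usage illustration (not part of the patches above): with this parameter in place, booting with a command line containing

    slab_max_order=0

keeps SLAB at order-0 slabs even on machines with more than 32MB of RAM. slab_max_order_setup() parses the value with get_option(), clamps it to the range [0, MAX_ORDER - 1], and sets slab_max_order_set so that kmem_cache_init() no longer raises the default to SLAB_MAX_ORDER_HI.
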
From 265d47e7115023df9e2b7a864b207b4738d9e18c Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Tue, 15 Nov 2011 15:04:00 -0800
Subject: slub: add taint flag outputting to debug paths

When we get corruption reports, it's useful to see if the kernel was
tainted, to rule out problems we can't do anything about.

Signed-off-by: Dave Jones
Signed-off-by: Andrew Morton
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 7d2a996..60552d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -570,7 +570,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
	va_end(args);
	printk(KERN_ERR "========================================"
			"=====================================\n");
-	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+	printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
	printk(KERN_ERR "----------------------------------------"
			"-------------------------------------\n\n");
 }

-- cgit v0.10.2

From face37f5e615646f364fa848f0a5c9d361d7a46e Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Tue, 15 Nov 2011 15:03:52 -0800
Subject: slab: add taint flag outputting to debug paths.

When we get corruption reports, it's useful to see if the kernel was
tainted, to rule out problems we can't do anything about.

Signed-off-by: Dave Jones
Signed-off-by: Andrew Morton
Signed-off-by: Pekka Enberg

diff --git a/mm/slab.c b/mm/slab.c
index b0414d1..a7f9c24 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1941,8 +1941,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
			/* Print header */
			if (lines == 0) {
				printk(KERN_ERR
-					"Slab corruption: %s start=%p, len=%d\n",
-					cachep->name, realobj, size);
+					"Slab corruption (%s): %s start=%p, len=%d\n",
+					print_tainted(), cachep->name, realobj, size);
				print_objinfo(cachep, objp, 0);
			}
			/* Hexdump the affected line */
@@ -3051,8 +3051,9 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
	if (entries != cachep->num - slabp->inuse) {
 bad:
		printk(KERN_ERR "slab: Internal list corruption detected in "
-				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-				cachep->name, cachep->num, slabp, slabp->inuse);
+				"cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+				cachep->name, cachep->num, slabp, slabp->inuse,
+				print_tainted());
		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
			sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
			1);

-- cgit v0.10.2

From 4c493a5a5c0bab6c434af2723328edd79c49aa0c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 11 Nov 2011 14:54:14 +0800
Subject: slub: add missed accounting

With the per-cpu partial list, a slab is added to the partial list first
and then moved to the node list. The __slab_free() code path for
add/remove_partial is almost deprecated (except for slub debug), but we
forgot to account add/remove_partial when moving per-cpu partial pages to
the node list, so the statistics for such events are always 0. Add the
corresponding accounting.

This is against the patch "slub: use correct parameter to add a page to
partial list tail".

Acked-by: Christoph Lameter
Signed-off-by: Shaohua Li
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index c313823..108ed03 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s)
			}
 
			if (l != m) {
-				if (l == M_PARTIAL)
+				if (l == M_PARTIAL) {
					remove_partial(n, page);
-				else
+					stat(s, FREE_REMOVE_PARTIAL);
+				} else {
					add_partial(n, page,
						DEACTIVATE_TO_TAIL);
+					stat(s, FREE_ADD_PARTIAL);
+				}
 
				l = m;
			}

-- cgit v0.10.2
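
A note on where the new counts show up (an assumption about the surrounding SLUB code, not something stated in the changelog): the FREE_ADD_PARTIAL and FREE_REMOVE_PARTIAL events are only collected on kernels built with CONFIG_SLUB_STATS, where they are exported as per-cache sysfs files, e.g.

    cat /sys/kernel/slab/dentry/free_add_partial

Before this fix, pages moved between the per-cpu partial list and the node list by unfreeze_partials() never incremented those counters; afterwards they do.
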
From 25f4379b8c79066c4be0e5995f37f5265733b801 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 22 Nov 2011 06:59:09 +0100
Subject: slub: fix slub_max_order Documentation

slub_max_order default is 3 (aka PAGE_ALLOC_COSTLY_ORDER), not 1.

Acked-by: David Rientjes
Acked-by: Christoph Lameter
Signed-off-by: Eric Dumazet
Signed-off-by: Pekka Enberg

diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index f464f47..2acdda9 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -117,7 +117,7 @@ can be influenced by kernel parameters:
 
 slub_min_objects=x		(default 4)
 slub_min_order=x		(default 0)
-slub_max_order=x		(default 1)
+slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
 
 slub_min_objects allows to specify how many objects must at least fit
 into one slab in order for the allocation order to be acceptable.

-- cgit v0.10.2

From 73736e0387ba0e6d2b703407b4d26168d31516a7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 13 Dec 2011 04:57:06 +0100
Subject: slub: fix a possible memleak in __slab_alloc()

Zhihua Che reported a possible memleak in the slub allocator on
CONFIG_PREEMPT=y builds.

It is possible the current thread migrates right before disabling irqs
in __slab_alloc(). We must check c->freelist again, and perform a normal
allocation instead of scratching c->freelist.

Many thanks to Zhihua Che for spotting this bug, introduced in 2.6.39.

V2: It's also possible an IRQ freed one (or several) object(s) and
populated c->freelist, so it's not a CONFIG_PREEMPT-only problem.

Cc: [2.6.39+]
Reported-by: Zhihua Che
Signed-off-by: Eric Dumazet
Acked-by: Christoph Lameter
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 108ed03..5e410a9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2169,6 +2169,11 @@ redo:
		goto new_slab;
	}
 
+	/* must check again c->freelist in case of cpu migration or IRQ */
+	object = c->freelist;
+	if (object)
+		goto load_freelist;
+
	stat(s, ALLOC_SLOWPATH);
 
	do {

-- cgit v0.10.2
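
An illustrative interleaving of the race being closed here (reconstructed from the changelog, not quoted from it):

    1. The fast path finds c->freelist empty and enters __slab_alloc().
    2. Before local_irq_save(), an IRQ frees objects into c->freelist,
       or the task migrates to a CPU whose c->freelist is populated.
    3. The old slow path ignored c->freelist and refilled it from the
       page freelist, so the objects already sitting in c->freelist
       were overwritten and leaked.

The added recheck consumes those objects via load_freelist instead.
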
From 8f1e33daeda6cd89753f9e77d174805a6f21db09 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Wed, 23 Nov 2011 09:24:27 -0600
Subject: slub: Switch per cpu partial page support off for debugging

Eric saw an issue with accounting of slabs during validation.

It's not possible to accurately determine how many per-cpu partial slabs
exist at any time, so this switches off per-cpu partial pages during
debug.

Acked-by: Eric Dumazet
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index ed3334d..4056d29 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3028,7 +3028,9 @@ static int kmem_cache_open(struct kmem_cache *s,
	 * per node list when we run out of per cpu objects. We only fetch 50%
	 * to keep some capacity around for frees.
	 */
-	if (s->size >= PAGE_SIZE)
+	if (kmem_cache_debug(s))
+		s->cpu_partial = 0;
+	else if (s->size >= PAGE_SIZE)
		s->cpu_partial = 2;
	else if (s->size >= 1024)
		s->cpu_partial = 6;

-- cgit v0.10.2

From 213eeb9fd9d66c33109e2ace242df214dc3a653d Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 11 Nov 2011 14:07:14 -0600
Subject: slub: Extract get_freelist from __slab_alloc

get_freelist retrieves free objects from the page freelist (put there by
remote frees) or deactivates a slab page if no more objects are
available.

Acked-by: David Rientjes
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 5e410a9..6dc79f8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2127,6 +2127,37 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
 }
 
 /*
+ * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
+ * or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+	struct page new;
+	unsigned long counters;
+	void *freelist;
+
+	do {
+		freelist = page->freelist;
+		counters = page->counters;
+		new.counters = counters;
+		VM_BUG_ON(!new.frozen);
+
+		new.inuse = page->objects;
+		new.frozen = freelist != NULL;
+
+	} while (!cmpxchg_double_slab(s, page,
+		freelist, counters,
+		NULL, new.counters,
+		"get_freelist"));
+
+	return freelist;
+}
+
+/*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
  *
@@ -2147,8 +2178,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
	void **object;
	unsigned long flags;
-	struct page new;
-	unsigned long counters;
 
	local_irq_save(flags);
 #ifdef CONFIG_PREEMPT
@@ -2176,29 +2205,7 @@ redo:
 
	stat(s, ALLOC_SLOWPATH);
 
-	do {
-		object = c->page->freelist;
-		counters = c->page->counters;
-		new.counters = counters;
-		VM_BUG_ON(!new.frozen);
-
-		/*
-		 * If there is no object left then we use this loop to
-		 * deactivate the slab which is simple since no objects
-		 * are left in the slab and therefore we do not need to
-		 * put the page back onto the partial list.
-		 *
-		 * If there are objects left then we retrieve them
-		 * and use them to refill the per cpu queue.
-		 */
-
-		new.inuse = c->page->objects;
-		new.frozen = object != NULL;
-
-	} while (!__cmpxchg_double_slab(s, c->page,
-			object, counters,
-			NULL, new.counters,
-			"__slab_alloc"));
+	object = get_freelist(s, c->page);
 
	if (!object) {
		c->page = NULL;

-- cgit v0.10.2
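
A short reading of the new helper (a summary of the code, not additional changelog text): when page->freelist is empty, new.frozen ends up 0, the cmpxchg unfreezes the page and NULL is returned, so __slab_alloc() drops c->page; otherwise the entire freelist is claimed (new.inuse = page->objects), the page stays frozen, and the returned list refills the per-cpu freelist.
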
From b13683d1cc14d1dd30b8e20f3ebea3f814ad029f Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 11 Nov 2011 14:54:14 +0800
Subject: slub: add missed accounting

With the per-cpu partial list, a slab is added to the partial list first
and then moved to the node list. The __slab_free() code path for
add/remove_partial is almost deprecated (except for slub debug), but we
forgot to account add/remove_partial when moving per-cpu partial pages to
the node list, so the statistics for such events are always 0. Add the
corresponding accounting.

This is against the patch "slub: use correct parameter to add a page to
partial list tail".

Acked-by: Christoph Lameter
Signed-off-by: Shaohua Li
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 4056d29..8284a20 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s)
			}
 
			if (l != m) {
-				if (l == M_PARTIAL)
+				if (l == M_PARTIAL) {
					remove_partial(n, page);
-				else
+					stat(s, FREE_REMOVE_PARTIAL);
+				} else {
					add_partial(n, page,
						DEACTIVATE_TO_TAIL);
+					stat(s, FREE_ADD_PARTIAL);
+				}
 
				l = m;
			}

-- cgit v0.10.2

From 74ee4ef1f901fbb014bdcdc9171d126490ce2b62 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 9 Jan 2012 13:19:45 -0800
Subject: slub: disallow changing cpu_partial from userspace for debug caches

For caches with debugging enabled, "slub: Switch per cpu partial page
support off for debugging" changes cpu_partial to 0. It shouldn't be
tunable from userspace for such caches; otherwise, the same accounting
issues arise during validation.

This patch disallows tuning /sys/kernel/slab/cache/cpu_partial to be
non-zero for caches with debugging enabled.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg

diff --git a/mm/slub.c b/mm/slub.c
index 6dc79f8..a47df0a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4649,6 +4649,8 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
	err = strict_strtoul(buf, 10, &objects);
	if (err)
		return err;
+	if (objects && kmem_cache_debug(s))
+		return -EINVAL;
 
	s->cpu_partial = objects;
	flush_all(s);

-- cgit v0.10.2
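
A usage sketch of the resulting behavior (assumed from the diff, not quoted from the changelog): for a cache with debugging enabled, for example when booted with slub_debug, a write such as

    echo 8 > /sys/kernel/slab/<cache>/cpu_partial

now fails with EINVAL, while writing 0 is still accepted and matches the cpu_partial = 0 that kmem_cache_open() sets for debug caches.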