From 86d9f48534e800e4d62cdc1b5aaf539f4c1d47d6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 27 Oct 2016 17:46:18 -0700 Subject: mm/slab: fix kmemcg cache creation delayed issue There is a bug report that SLAB causes an extreme load average due to over 2000 kworker threads. https://bugzilla.kernel.org/show_bug.cgi?id=172981 This issue is caused by the kmemcg feature, which tries to create a new set of kmem_caches for each memcg. Recently, kmem_cache creation was slowed by synchronize_sched(), and further kmem_cache creation is also delayed since kmem_cache creation is serialized by the global slab_mutex lock. So the number of kworkers trying to create kmem_caches grows quietly. synchronize_sched() is for lockless access to the node's shared array, but it's not needed when a new kmem_cache is created. So, this patch rules out that case. Fixes: 801faf0db894 ("mm/slab: lockless decision to grow cache") Link: http://lkml.kernel.org/r/1475734855-4837-1-git-send-email-iamjoonsoo.kim@lge.com Reported-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 090fb26..c451e3f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -966,7 +966,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, * guaranteed to be valid until irq is re-enabled, because it will be * freed after synchronize_sched(). */ - if (force_change) + if (old_shared && force_change) synchronize_sched(); fail: -- cgit v0.10.2 From b274c0bb394c6a69ac12feac7c2db81f5aff5a55 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Oct 2016 17:46:21 -0700 Subject: kcov: properly check if we are in an interrupt in_interrupt() returns a nonzero value when we are either in an interrupt or have bh disabled via local_bh_disable(). Since we are only interested in ignoring coverage from actual interrupts, do a proper check instead of just calling in_interrupt(). As a result of this change, kcov will start to collect coverage from within local_bh_disable()/local_bh_enable() sections. Link: http://lkml.kernel.org/r/1476115803-20712-1-git-send-email-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Nicolai Stange Cc: Andrey Ryabinin Cc: Kees Cook Cc: James Morse Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3f..30e6d05 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). 
*/ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit v0.10.2 From 21753583056d48a5fad964d6f272e28168426845 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 27 Oct 2016 17:46:24 -0700 Subject: h8300: fix syscall restarting Back in commit f56141e3e2d9 ("all arches, signal: move restart_block to struct task_struct"), all architectures and core code were changed to use task_struct::restart_block. However, when h8300 support was subsequently restored in v4.2, it was not updated to account for this, and maintains thread_info::restart_block, which is not kept in sync. This patch drops the redundant restart_block from thread_info, and moves h8300 to the common one in task_struct, ensuring that syscall restarting always works as expected. Fixes: f56141e3e2d9 ("all arches, signal: move restart_block to struct task_struct") Link: http://lkml.kernel.org/r/1476714934-11635-1-git-send-email-mark.rutland@arm.com Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Yoshinori Sato Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index b408fe6..3cef068 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; - struct restart_block restart_block; }; /* @@ -44,9 +43,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index ad1f81f..7138303 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -79,7 +79,7 @@ restore_sigcontext(struct sigcontext *usc, int *pd0) unsigned int er0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore passed registers */ #define COPY(r) do { err |= get_user(regs->r, &usc->sc_##r); } while (0) -- cgit v0.10.2 From 1bc11d70b5db7c6bb1414b283d7f09b1fe1ac0d0 Mon Sep 17 00:00:00 2001 From: Alexander Polakov Date: Thu, 27 Oct 2016 17:46:27 -0700 Subject: mm/list_lru.c: avoid error-path NULL pointer deref As described in https://bugzilla.kernel.org/show_bug.cgi?id=177821: After some analysis, it seems that the problem is in alloc_super(). In case list_lru_init_memcg() fails, it goes into destroy_super(), which calls list_lru_destroy(). And in list_lru_init() we see that in case memcg_init_list_lru() fails, lru->node is freed, but not set to NULL, which then leads list_lru_destroy() to believe it is initialized and call memcg_destroy_list_lru(). memcg_destroy_list_lru() in turn can access lru->node[i].memcg_lrus, which is NULL. 
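A condensed sketch of the two functions involved may make the sequence clearer (abbreviated from mm/list_lru.c for illustration only; locking and unrelated error handling are omitted):

	int __list_lru_init(struct list_lru *lru, bool memcg_aware, ...)
	{
		lru->node = kzalloc(nr_node_ids * sizeof(*lru->node), GFP_KERNEL);
		...
		err = memcg_init_list_lru(lru, memcg_aware);
		if (err) {
			kfree(lru->node);	/* freed, but lru->node is left non-NULL */
			goto out;
		}
		...
	}

	void list_lru_destroy(struct list_lru *lru)
	{
		if (!lru->node)			/* the dangling pointer passes this check... */
			return;
		memcg_destroy_list_lru(lru);	/* ...and this dereferences node[i].memcg_lrus == NULL */
		kfree(lru->node);
		lru->node = NULL;
	}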
[akpm@linux-foundation.org: add comment] Signed-off-by: Alexander Polakov Acked-by: Vladimir Davydov Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/list_lru.c b/mm/list_lru.c index 1d05cb9..234676e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -554,6 +554,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, err = memcg_init_list_lru(lru, memcg_aware); if (err) { kfree(lru->node); + /* Do this so a list_lru_destroy() doesn't crash: */ + lru->node = NULL; goto out; } -- cgit v0.10.2 From 1f84a18fc010d7a62667199c9be35872bbf31526 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 27 Oct 2016 17:46:29 -0700 Subject: mm: page_alloc: use KERN_CONT where appropriate Recent changes to printk require KERN_CONT to be used when continuing log messages. So add KERN_CONT where necessary. [akpm@linux-foundation.org: coding-style fixes] Fixes: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") Link: http://lkml.kernel.org/r/c7df37c8665134654a17aaeb8b9f6ace1d6db58b.1476239034.git.joe@perches.com Reported-by: Mark Rutland Signed-off-by: Joe Perches Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de7c6e4..8fd42aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4224,7 +4224,7 @@ static void show_migration_types(unsigned char type) } *p = '\0'; - printk("(%s) ", tmp); + printk(KERN_CONT "(%s) ", tmp); } /* @@ -4335,7 +4335,8 @@ void show_free_areas(unsigned int filter) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; show_node(zone); - printk("%s" + printk(KERN_CONT + "%s" " free:%lukB" " min:%lukB" " low:%lukB" @@ -4382,8 +4383,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %ld", zone->lowmem_reserve[i]); - printk("\n"); + printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); + printk(KERN_CONT "\n"); } for_each_populated_zone(zone) { @@ -4394,7 +4395,7 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); - printk("%s: ", zone->name); + printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -4412,11 +4413,12 @@ void show_free_areas(unsigned int filter) } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - printk("%lu*%lukB ", nr[order], K(1UL) << order); + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); if (nr[order]) show_migration_types(types[order]); } - printk("= %lukB\n", K(total)); + printk(KERN_CONT "= %lukB\n", K(total)); } hugetlb_show_meminfo(); -- cgit v0.10.2 From 07a63c41fa1f6533f5668e5b33a295bfd63aa534 Mon Sep 17 00:00:00 2001 From: Aruna Ramakrishna Date: Thu, 27 Oct 2016 17:46:32 -0700 Subject: mm/slab: improve performance of gathering slabinfo stats On large systems, when some slab caches grow to millions of objects (and many gigabytes), running 'cat /proc/slabinfo' can take up to 1-2 seconds. During this time, interrupts are disabled while walking the slab lists (slabs_full, slabs_partial, and slabs_free) for each node, and this sometimes causes timeouts in other drivers (for instance, Infiniband). This patch optimizes 'cat /proc/slabinfo' by maintaining a counter of the total number of allocated slabs per node, per cache. This counter is updated when a slab is created or destroyed. 
This enables us to skip traversing the slabs_full list while gathering slabinfo statistics, and since slabs_full tends to be the biggest list when the cache is large, it results in a dramatic performance improvement. Getting slabinfo statistics now only requires walking the slabs_free and slabs_partial lists, and those lists are usually much smaller than slabs_full. We tested this after growing the dentry cache to 70GB, and the performance improved from 2s to 5ms. Link: http://lkml.kernel.org/r/1472517876-26814-1-git-send-email-aruna.ramakrishna@oracle.com Signed-off-by: Aruna Ramakrishna Acked-by: David Rientjes Cc: Mike Kravetz Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index c451e3f..0b0550c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -233,6 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; + parent->num_slabs = 0; } #define MAKE_LIST(cachep, listp, slab, nodeid) \ @@ -1382,24 +1383,27 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; + unsigned long num_slabs_partial = 0, num_slabs_free = 0; + unsigned long num_slabs_full; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) { - active_objs += cachep->num; - active_slabs++; - } + num_slabs = n->num_slabs; list_for_each_entry(page, &n->slabs_partial, lru) { active_objs += page->active; - active_slabs++; + num_slabs_partial++; } list_for_each_entry(page, &n->slabs_free, lru) - num_slabs++; + num_slabs_free++; free_objects += n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_slabs += active_slabs; num_objs = num_slabs * cachep->num; + active_slabs = num_slabs - num_slabs_free; + num_slabs_full = num_slabs - + (num_slabs_partial + num_slabs_free); + active_objs += (num_slabs_full * cachep->num); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); @@ -2314,6 +2318,7 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); + n->num_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. 
@@ -2752,6 +2757,8 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) list_add_tail(&page->lru, &(n->slabs_free)); else fixup_slab_list(cachep, n, page, &list); + + n->num_slabs++; STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -3443,6 +3450,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); + n->num_slabs--; } } @@ -4099,6 +4107,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) unsigned long num_objs; unsigned long active_slabs = 0; unsigned long num_slabs, free_objects = 0, shared_avail = 0; + unsigned long num_slabs_partial = 0, num_slabs_free = 0; + unsigned long num_slabs_full = 0; const char *name; char *error = NULL; int node; @@ -4111,33 +4121,34 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) { - if (page->active != cachep->num && !error) - error = "slabs_full accounting error"; - active_objs += cachep->num; - active_slabs++; - } + num_slabs += n->num_slabs; + list_for_each_entry(page, &n->slabs_partial, lru) { if (page->active == cachep->num && !error) error = "slabs_partial accounting error"; if (!page->active && !error) error = "slabs_partial accounting error"; active_objs += page->active; - active_slabs++; + num_slabs_partial++; } + list_for_each_entry(page, &n->slabs_free, lru) { if (page->active && !error) error = "slabs_free accounting error"; - num_slabs++; + num_slabs_free++; } + free_objects += n->free_objects; if (n->shared) shared_avail += n->shared->avail; spin_unlock_irq(&n->list_lock); } - num_slabs += active_slabs; num_objs = num_slabs * cachep->num; + active_slabs = num_slabs - num_slabs_free; + num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); + active_objs += (num_slabs_full * cachep->num); + if (num_objs - active_objs != free_objects && !error) error = "free_objects accounting error"; diff --git a/mm/slab.h b/mm/slab.h index 9653f2e..bc05fdc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -432,6 +432,7 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; + unsigned long num_slabs; unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ -- cgit v0.10.2 From 8c8d4d45204902e144abc0f15b7c658828028fa1 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 27 Oct 2016 17:46:35 -0700 Subject: ipc: account for kmem usage on mqueue and msg When kmem accounting switched from accounting by default to accounting only allocations flagged with __GFP_ACCOUNT, IPC mqueue and messages were left out. The production use case at hand is that mqueues should be customizable via sysctls in Docker containers in a Kubernetes cluster. This can only be safely allowed to the users of the cluster (without the risk that they can cause resource shortage on a node, influencing other users' containers) if all resources they control are bounded, i.e. accounted for. 
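For context, the fix below amounts to a one-flag change per allocation. A minimal sketch of the distinction (illustrative only; in <linux/gfp.h>, GFP_KERNEL_ACCOUNT is defined as GFP_KERNEL | __GFP_ACCOUNT):

	/* Unaccounted: invisible to the memcg kmem counters, so a container
	 * can grow this allocation without it counting toward its limit. */
	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL);

	/* Accounted: charged to the caller's memory cgroup, and therefore
	 * bounded by the container's configured memory limit. */
	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);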
Link: http://lkml.kernel.org/r/1476806075-1210-1-git-send-email-arozansk@redhat.com Signed-off-by: Aristeu Rozanski Reported-by: Stefan Schimanski Acked-by: Davidlohr Bueso Cc: Alexey Dobriyan Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Stefan Schimanski Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msgutil.c b/ipc/msgutil.c index a521999..bf74eaa 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -53,7 +53,7 @@ static struct msg_msg *alloc_msg(size_t len) size_t alen; alen = min(len, DATALEN_MSG); - msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT); if (msg == NULL) return NULL; @@ -65,7 +65,7 @@ static struct msg_msg *alloc_msg(size_t len) while (len > 0) { struct msg_msgseg *seg; alen = min(len, DATALEN_SEG); - seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL); + seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT); if (seg == NULL) goto out_err; *pseg = seg; -- cgit v0.10.2 From c0a0aba8e478229b2f0956918542152fbad3f794 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 27 Oct 2016 17:46:38 -0700 Subject: kconfig.h: remove config_enabled() macro The use of config_enabled() is ambiguous. For config options, IS_ENABLED(), IS_REACHABLE(), etc. will make the intention clearer. Sometimes config_enabled() has been used for non-config options because it is useful to check whether the given symbol is defined or not. I have been working on deprecating config_enabled(), and now is the time to finish this work. Some new users have appeared for v4.9-rc1, but it is trivial to replace them: - arch/x86/mm/kaslr.c: replace config_enabled() with IS_ENABLED(), because CONFIG_X86_ESPFIX64 and CONFIG_EFI are boolean. - include/asm-generic/export.h: replace config_enabled() with __is_defined(). Then, config_enabled() can be removed. Going forward, please use IS_ENABLED(), IS_REACHABLE(), etc. for config options, and __is_defined() for non-config symbols. Link: http://lkml.kernel.org/r/1476616078-32252-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Ingo Molnar Acked-by: Nicolas Pitre Cc: Peter Oberparleiter Cc: Arnd Bergmann Cc: Kees Cook Cc: Michal Marek Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Thomas Garnier Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ddd2661..887e571 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -104,10 +104,10 @@ void __init kernel_randomize_memory(void) * consistent with the vaddr_start/vaddr_end variables. 
*/ BUILD_BUG_ON(vaddr_start >= vaddr_end); - BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && vaddr_end >= EFI_VA_START); - BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || - config_enabled(CONFIG_EFI)) && + BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || + IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 43199a0..63554e9 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -70,7 +70,7 @@ KSYM(__kcrctab_\name): #include #define __EXPORT_SYMBOL(sym, val, sec) \ - __cond_export_sym(sym, val, sec, config_enabled(__KSYM_##sym)) + __cond_export_sym(sym, val, sec, __is_defined(__KSYM_##sym)) #define __cond_export_sym(sym, val, sec, conf) \ ___cond_export_sym(sym, val, sec, conf) #define ___cond_export_sym(sym, val, sec, enabled) \ diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 15ec117..8f2e059 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -31,7 +31,6 @@ * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when * the last step cherry picks the 2nd arg, we get a zero. */ -#define config_enabled(cfg) ___is_defined(cfg) #define __is_defined(x) ___is_defined(x) #define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) @@ -41,13 +40,13 @@ * otherwise. For boolean options, this is equivalent to * IS_ENABLED(CONFIG_FOO). */ -#define IS_BUILTIN(option) config_enabled(option) +#define IS_BUILTIN(option) __is_defined(option) /* * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 * otherwise. */ -#define IS_MODULE(option) config_enabled(option##_MODULE) +#define IS_MODULE(option) __is_defined(option##_MODULE) /* * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled -- cgit v0.10.2 From 0e07f663c90154079b287931257defbdb1d5405a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 27 Oct 2016 17:46:41 -0700 Subject: latent_entropy: raise CONFIG_FRAME_WARN by default When building with the latent_entropy plugin, set the default CONFIG_FRAME_WARN to 2048, since some __init functions have many basic blocks that, when instrumented by the latent_entropy plugin, grow beyond the 1024-byte stack size limit on 32-bit builds. Link: http://lkml.kernel.org/r/20161018211216.GA39687@beast Signed-off-by: Kees Cook Reported-by: kbuild test robot Cc: Emese Revfy Cc: Ingo Molnar Cc: Michal Marek Cc: "Paul E. McKenney" Cc: Dan Williams Cc: Andrey Ryabinin Cc: Josh Poimboeuf Cc: Tejun Heo Cc: Nikolay Aleksandrov Cc: Dmitry Vyukov Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 33bc56c..b01e547 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -198,6 +198,7 @@ config FRAME_WARN int "Warn for stack frames larger than (needs gcc 4.4)" range 0 8192 default 0 if KASAN + default 2048 if GCC_PLUGIN_LATENT_ENTROPY default 1024 if !64BIT default 2048 if 64BIT help -- cgit v0.10.2 From 02754e0a484a50a92d44c38879f2cb2792ebc572 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 27 Oct 2016 17:46:44 -0700 Subject: lib/stackdepot.c: bump stackdepot capacity from 16MB to 128MB KASAN uses stackdepot to memorize stacks for all kmalloc/kfree calls. The current stackdepot capacity is 16MB (1024 top-level entries x 4 pages on the second level). 
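For scale, the arithmetic behind these capacity figures (assuming 4KB pages and the ~200-byte average stack size estimated below):

	1024 slabs * 4 pages * 4096 B = 16 MB;   16 MB / ~200 B  ~= 84K stacks
	8192 slabs * 4 pages * 4096 B = 128 MB;  128 MB / ~200 B ~= 670K stacks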
The size of each stack is (num_frames + 3) * sizeof(long), which gives us ~84K stacks. This capacity was chosen empirically and it is enough to run the kernel normally. However, when lots of configs are enabled and a fuzzer tries to maximize code coverage, it easily hits the limit within tens of minutes. I've tested for a long time with the number of top-level entries bumped 4x (4096), and I think I've seen an overflow only once. But I don't have all configs enabled and code coverage has not reached its maximum yet. So bump it 8x to 8192. Since we have a two-level table, the memory cost of this is very moderate -- currently the top-level table is 8KB, with this patch it is 64KB, which is negligible under KASAN. Here is some approximate math. 128MB allows us to memorize ~670K stacks (assuming a stack is ~200 bytes). I've grepped the kernel for kmalloc|kfree|kmem_cache_alloc|kmem_cache_free| kzalloc|kstrdup|kstrndup|kmemdup and it gives ~60K matches. Most alloc/free call sites are reachable with only one stack, but some utility functions can have a large fanout. Assuming an average fanout of 5x, the total number of alloc/free stacks is ~300K. Link: http://lkml.kernel.org/r/1476458416-122131-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Joonsoo Kim Cc: Baozeng Ding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 60f77f1..4d830e2 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -50,7 +50,7 @@ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) -#define STACK_ALLOC_SLABS_CAP 1024 +#define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) -- cgit v0.10.2 From 37df49f433bc3a11f5716fe65aaec5189c6402cb Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 27 Oct 2016 17:46:47 -0700 Subject: mm: kmemleak: ensure that the task stack is not freed during scanning Commit 68f24b08ee89 ("sched/core: Free the stack early if CONFIG_THREAD_INFO_IN_TASK") may cause the task->stack to be freed during kmemleak_scan() execution, leading to either a NULL pointer fault (if task->stack is NULL) or kmemleak accessing already freed memory. This patch uses the new try_get_task_stack() API to ensure that the task stack is not freed during kmemleak stack scanning. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=173901. 
Fixes: 68f24b08ee89 ("sched/core: Free the stack early if CONFIG_THREAD_INFO_IN_TASK") Link: http://lkml.kernel.org/r/1476266223-14325-1-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reported-by: CAI Qian Tested-by: CAI Qian Acked-by: Michal Hocko Cc: Andy Lutomirski Cc: CAI Qian Cc: Hillf Danton Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a5e453c..e5355a5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1453,8 +1453,11 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - scan_block(task_stack_page(p), task_stack_page(p) + - THREAD_SIZE, NULL); + void *stack = try_get_task_stack(p); + if (stack) { + scan_block(stack, stack + THREAD_SIZE, NULL); + put_task_stack(p); + } } while_each_thread(g, p); read_unlock(&tasklist_lock); } -- cgit v0.10.2 From 06b2849d103f4a91212876a211d0d7df227a9513 Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Thu, 27 Oct 2016 17:46:50 -0700 Subject: proc: fix NULL dereference when reading /proc/<pid>/auxv Reading the auxv of any kernel thread results in a NULL pointer dereference in auxv_read(), where mm can be NULL. Fix that by checking for a NULL mm and bailing out early. This also restores the original behavior, which was changed by recent commit c5317167854e ("proc: switch auxv to use of __mem_open()"). # cat /proc/2/auxv Unable to handle kernel NULL pointer dereference at virtual address 000000a8 Internal error: Oops: 17 [#1] PREEMPT SMP ARM CPU: 3 PID: 113 Comm: cat Not tainted 4.9.0-rc1-ARCH+ #1 Hardware name: BCM2709 task: ea3b0b00 task.stack: e99b2000 PC is at auxv_read+0x24/0x4c LR is at do_readv_writev+0x2fc/0x37c Process cat (pid: 113, stack limit = 0xe99b2210) Call chain: auxv_read do_readv_writev vfs_readv default_file_splice_read splice_direct_to_actor do_splice_direct do_sendfile SyS_sendfile64 ret_fast_syscall Fixes: c5317167854e ("proc: switch auxv to use of __mem_open()") Link: http://lkml.kernel.org/r/1476966200-14457-1-git-send-email-chianglungyu@gmail.com Signed-off-by: Leon Yu Acked-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Al Viro Cc: Kees Cook Cc: John Stultz Cc: Mateusz Guzik Cc: Janis Danisevskis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/base.c b/fs/proc/base.c index adfc5b4..ca651ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1012,6 +1012,9 @@ static ssize_t auxv_read(struct file *file, char __user *buf, { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; + + if (!mm) + return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ -- cgit v0.10.2 From 8f72cb4ef90c63bcb5111c2e3ec7ea2727eab2f8 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Thu, 27 Oct 2016 17:46:53 -0700 Subject: CREDITS: update credit information for Martin Kepplinger Content and employer changed. 
Link: http://lkml.kernel.org/r/1477304102-28830-1-git-send-email-martin.kepplinger@ginzinger.com Signed-off-by: Martin Kepplinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/CREDITS b/CREDITS index 513aaa3..8373676 100644 --- a/CREDITS +++ b/CREDITS @@ -1864,10 +1864,11 @@ S: The Netherlands N: Martin Kepplinger E: martink@posteo.de -E: martin.kepplinger@theobroma-systems.com +E: martin.kepplinger@ginzinger.com W: http://www.martinkepplinger.com D: mma8452 accelerators iio driver -D: Kernel cleanups +D: pegasus_notetaker input driver +D: Kernel fixes and cleanups S: Garnisonstraße 26 S: 4020 Linz S: Austria -- cgit v0.10.2 From 89a2848381b5fcd9c4d9c0cd97680e3b28730e31 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 27 Oct 2016 17:46:56 -0700 Subject: mm: memcontrol: do not recurse in direct reclaim On 4.0, we saw a stack corruption from a page fault entering direct memory cgroup reclaim, calling into btrfs_releasepage(), which then tried to allocate an extent and recursed back into a kmem charge ad nauseam: [...] btrfs_releasepage+0x2c/0x30 try_to_release_page+0x32/0x50 shrink_page_list+0x6da/0x7a0 shrink_inactive_list+0x1e5/0x510 shrink_lruvec+0x605/0x7f0 shrink_zone+0xee/0x320 do_try_to_free_pages+0x174/0x440 try_to_free_mem_cgroup_pages+0xa7/0x130 try_charge+0x17b/0x830 memcg_charge_kmem+0x40/0x80 new_slab+0x2d9/0x5a0 __slab_alloc+0x2fd/0x44f kmem_cache_alloc+0x193/0x1e0 alloc_extent_state+0x21/0xc0 __clear_extent_bit+0x2b5/0x400 try_release_extent_mapping+0x1a3/0x220 __btrfs_releasepage+0x31/0x70 btrfs_releasepage+0x2c/0x30 try_to_release_page+0x32/0x50 shrink_page_list+0x6da/0x7a0 shrink_inactive_list+0x1e5/0x510 shrink_lruvec+0x605/0x7f0 shrink_zone+0xee/0x320 do_try_to_free_pages+0x174/0x440 try_to_free_mem_cgroup_pages+0xa7/0x130 try_charge+0x17b/0x830 mem_cgroup_try_charge+0x65/0x1c0 handle_mm_fault+0x117f/0x1510 __do_page_fault+0x177/0x420 do_page_fault+0xc/0x10 page_fault+0x22/0x30 On later kernels, kmem charging is opt-in rather than opt-out, and that particular kmem allocation in btrfs_releasepage() is no longer being charged and won't recurse and overrun the stack anymore. But it's not impossible for an accounted allocation to happen from the memcg direct reclaim context, and we needed to reproduce this crash many times before we even got a useful stack trace out of it. Like other direct reclaimers, mark tasks in memcg reclaim PF_MEMALLOC to avoid recursing into any other form of direct reclaim. Then let recursive charges from PF_MEMALLOC contexts bypass the cgroup limit. Link: http://lkml.kernel.org/r/20161025141050.GA13019@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae052b5..0f870ba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1917,6 +1917,15 @@ retry: current->flags & PF_EXITING)) goto force; + /* + * Prevent unbounded recursion when reclaim operations need to + * allocate memory. This might exceed the limits temporarily, + * but we prefer facilitating memory reclaim and getting back + * under the limit over triggering OOM kills in these cases. 
+ */ + if (unlikely(current->flags & PF_MEMALLOC)) + goto force; + if (unlikely(task_in_memcg_oom(current))) goto nomem; diff --git a/mm/vmscan.c b/mm/vmscan.c index 744f926..76fda22 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3043,7 +3043,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + current->flags |= PF_MEMALLOC; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + current->flags &= ~PF_MEMALLOC; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); -- cgit v0.10.2 From 62e931fac45b17c2a42549389879411572f75804 Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Thu, 27 Oct 2016 17:46:59 -0700 Subject: lib/genalloc.c: start search from start of chunk gen_pool_alloc_algo() iterates over the chunks of a pool trying to find a contiguous block of memory that satisfies the allocation request. The shortcut if (size > atomic_read(&chunk->avail)) continue; makes the loop skip over chunks that do not have enough bytes left to fulfill the request. There are two situations, though, where an allocation might still fail: (1) The available memory is not contiguous, i.e. the request cannot be fulfilled due to external fragmentation. (2) A race condition. Another thread runs the same code concurrently and is quicker to grab the available memory. In those situations, the loop calls pool->algo() to search the entire chunk, and pool->algo() returns some value that is >= end_bit to indicate that the search failed. This return value is then assigned to start_bit. The variables start_bit and end_bit describe the range that should be searched, and this range should be reset for every chunk that is searched. Today, the code fails to reset start_bit to 0. As a result, prefixes of subsequent chunks are ignored. Memory allocations might fail even though there is plenty of room left in these prefixes of those other chunks. Fixes: 7f184275aa30 ("lib, Make gen_pool memory allocator lockless") Link: http://lkml.kernel.org/r/1477420604-28918-1-git-send-email-danielmentz@google.com Signed-off-by: Daniel Mentz Reviewed-by: Mathieu Desnoyers Acked-by: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/genalloc.c b/lib/genalloc.c index 0a11396..144fe6b 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -292,7 +292,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, struct gen_pool_chunk *chunk; unsigned long addr = 0; int order = pool->min_alloc_order; - int nbits, start_bit = 0, end_bit, remain; + int nbits, start_bit, end_bit, remain; #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG BUG_ON(in_nmi()); @@ -307,6 +307,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, if (size > atomic_read(&chunk->avail)) continue; + start_bit = 0; end_bit = chunk_size(chunk) >> order; retry: start_bit = algo(chunk->bits, end_bit, start_bit, -- cgit v0.10.2 From 14f947c87a2164b19c7f8c02234f4f348e03f409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 27 Oct 2016 17:47:02 -0700 Subject: fs: exofs: print a hex number after a 0x prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes the message hard to interpret correctly if a base 10 number is prefixed by 0x. So change to a hex number. 
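A small userspace demonstration of the confusion (hypothetical example, not part of the patch): with %llu the value is printed in decimal, so the 0x prefix makes a reader parse it as the wrong number.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long ino = 255;

		printf("0x%llu\n", ino);	/* prints "0x255", which reads as hex 0x255 = 597 */
		printf("0x%llx\n", ino);	/* prints "0xff", the intended representation */
		return 0;
	}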
Link: http://lkml.kernel.org/r/20161026125658.25728-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Cc: Boaz Harrosh Cc: Benny Halevy Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 79101651..42f9a0a 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -137,7 +137,7 @@ Espan: bad_entry: EXOFS_ERR( "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " - "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", + "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n", dir->i_ino, error, (page->index<inode_no)), rec_len, p->name_len); -- cgit v0.10.2 From ee52c44dee63ff2686a7b0d98fff7c80852ac022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 27 Oct 2016 17:47:04 -0700 Subject: block: DAC960: print a hex number after a 0x prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes the message hard to interpret correctly if a base 10 number is prefixed by 0x. So change to a hex number. Link: http://lkml.kernel.org/r/20161026125658.25728-3-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 811e11c..0809cda 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -2954,7 +2954,7 @@ DAC960_DetectController(struct pci_dev *PCI_Device, case DAC960_PD_Controller: if (!request_region(Controller->IO_Address, 0x80, Controller->FullModelName)) { - DAC960_Error("IO port 0x%d busy for Controller at\n", + DAC960_Error("IO port 0x%lx busy for Controller at\n", Controller, Controller->IO_Address); goto Failure; } @@ -2990,7 +2990,7 @@ DAC960_DetectController(struct pci_dev *PCI_Device, case DAC960_P_Controller: if (!request_region(Controller->IO_Address, 0x80, Controller->FullModelName)){ - DAC960_Error("IO port 0x%d busy for Controller at\n", + DAC960_Error("IO port 0x%lx busy for Controller at\n", Controller, Controller->IO_Address); goto Failure; } -- cgit v0.10.2 From 9105585d13bd2946f0eb2a664e32ec9a9b23bf1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 27 Oct 2016 17:47:07 -0700 Subject: ipack: print a hex number after a 0x prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes the result hard to interpret correctly if a base 10 number is prefixed by 0x. So change to a hex number. 
Link: http://lkml.kernel.org/r/20161026125658.25728-4-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Cc: Samuel Iglesias Gonsalvez Cc: Jens Taprogge Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/ipack/ipack.c b/drivers/ipack/ipack.c index c0e7b62..1210244 100644 --- a/drivers/ipack/ipack.c +++ b/drivers/ipack/ipack.c @@ -178,7 +178,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, idev->id_vendor, idev->id_device); } -ipack_device_attr(id_format, "0x%hhu\n"); +ipack_device_attr(id_format, "0x%hhx\n"); static DEVICE_ATTR_RO(id); static DEVICE_ATTR_RO(id_device); -- cgit v0.10.2 From 17a88939568be28a8ea5195b55ef3a84a469777e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 27 Oct 2016 17:47:10 -0700 Subject: cris/arch-v32: cryptocop: print a hex number after a 0x prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes the result hard to interpret correctly if a base 10 number is prefixed by 0x. So change to a hex number. Link: http://lkml.kernel.org/r/20161026125658.25728-6-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index 099e170..0068fd4 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -3149,7 +3149,7 @@ static void print_dma_descriptors(struct cryptocop_int_operation *iop) printk("print_dma_descriptors start\n"); printk("iop:\n"); - printk("\tsid: 0x%lld\n", iop->sid); + printk("\tsid: 0x%llx\n", iop->sid); printk("\tcdesc_out: 0x%p\n", iop->cdesc_out); printk("\tcdesc_in: 0x%p\n", iop->cdesc_in); -- cgit v0.10.2 From 8e819101ce6fcc58801c9a813ea99c4da0255eef Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Thu, 27 Oct 2016 17:47:12 -0700 Subject: drivers/misc/sgi-gru/grumain.c: remove bogus 0x prefix from printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We would like this to be a decimal number. Link: http://lkml.kernel.org/r/20161026134746.GA30169@sgi.com Signed-off-by: Dimitri Sivanich Reported-by: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 1525870..33741ad 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -283,7 +283,7 @@ static void gru_unload_mm_tracker(struct gru_state *gru, spin_lock(&gru->gs_asid_lock); BUG_ON((asids->mt_ctxbitmap & ctxbitmap) != ctxbitmap); asids->mt_ctxbitmap ^= ctxbitmap; - gru_dbg(grudev, "gid %d, gts %p, gms %p, ctxnum 0x%d, asidmap 0x%lx\n", + gru_dbg(grudev, "gid %d, gts %p, gms %p, ctxnum %d, asidmap 0x%lx\n", gru->gs_gid, gts, gms, gts->ts_ctxnum, gms->ms_asidmap[0]); spin_unlock(&gru->gs_asid_lock); spin_unlock(&gms->ms_asid_lock); -- cgit v0.10.2