From c1470b33bb6e18cddd361fef339ef225b8339fe7 Mon Sep 17 00:00:00 2001
From: zhong jiang
Date: Thu, 11 Aug 2016 15:32:55 -0700
Subject: mm/hugetlb: fix incorrect hugepages count during mem hotplug

When memory hotplug operates, free hugepages will be freed if a
movable node is taken offline.  Therefore, /proc/sys/vm/nr_hugepages
will be incorrect.

Fix it by reducing max_huge_pages when the node is offlined.

n-horiguchi@ah.jp.nec.com said:

: dissolve_free_huge_page intends to break a hugepage into buddy, and the
: destination hugepage is supposed to be allocated from the pool of the
: destination node, so the system-wide pool size is reduced.  So adding
: h->max_huge_pages-- makes sense to me.

Link: http://lkml.kernel.org/r/1470624546-902-1-git-send-email-zhongjiang@huawei.com
Signed-off-by: zhong jiang
Cc: Mike Kravetz
Acked-by: Naoya Horiguchi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b9aa1b0..87e11d8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1448,6 +1448,7 @@ static void dissolve_free_huge_page(struct page *page)
 		list_del(&page->lru);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
+		h->max_huge_pages--;
 		update_and_free_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
--
cgit v0.10.2

From 2f95ff90b99600f53df4a0aa652322d349d67957 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Thu, 11 Aug 2016 15:32:57 -0700
Subject: proc, meminfo: use correct helpers for calculating LRU sizes in meminfo

meminfo_proc_show() and si_mem_available() are using the wrong helpers
for calculating the size of the LRUs.  The user-visible impact is that
there appears to be an abnormally high number of unevictable pages.

Link: http://lkml.kernel.org/r/20160805105805.GR2799@techsingularity.net
Signed-off-by: Mel Gorman
Cc: Dave Chinner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 09e18fd..b9a8c81 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	cached = 0;
 
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	available = si_mem_available();
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ab2c0ff..3fbe73a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4060,7 +4060,7 @@ long si_mem_available(void)
 	int lru;
 
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	for_each_zone(zone)
 		wmark_low += zone->watermark[WMARK_LOW];
--
cgit v0.10.2
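
Mel's symptom is easy to observe from user space.  As an illustration,
here is a hypothetical stand-alone checker written for this note (not
code from the patch): a plain C program that prints the /proc/meminfo
field the bug inflates.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f) {
            perror("/proc/meminfo");
            return 1;
        }
        /* print only the field the bug inflates */
        while (fgets(line, sizeof(line), f))
            if (!strncmp(line, "Unevictable:", 12))
                fputs(line, stdout);
        fclose(f);
        return 0;
    }

On a kernel carrying the bug the value printed appears abnormally
high, matching the description above; with the node helpers in place
it reflects the true unevictable LRU size again.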
From 1f47b61fb4077936465dcde872a4e5cc4fe708da Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Thu, 11 Aug 2016 15:33:00 -0700
Subject: mm: memcontrol: fix swap counter leak on swapout from offline cgroup

An offline memory cgroup might have anonymous memory or shmem left
charged to it and no swap.  Since only swap entries pin the id of an
offline cgroup, such a cgroup will have no id and so an attempt to
swap out its anon/shmem will not store memory cgroup info in the swap
cgroup map.  As a result, memcg->swap or memcg->memsw will never get
uncharged from it and any of its ancestors.

Fix this by always charging swapout to the first ancestor cgroup that
hasn't released its id yet.

[hannes@cmpxchg.org: add comment to mem_cgroup_swapout]
[vdavydov@virtuozzo.com: use WARN_ON_ONCE() in mem_cgroup_id_get_online()]
Link: http://lkml.kernel.org/r/20160803123445.GJ13263@esperanza
Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
Link: http://lkml.kernel.org/r/5336daa5c9a32e776067773d9da655d2dc126491.1470219853.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov
Acked-by: Johannes Weiner
Acked-by: Michal Hocko
Cc: [3.19+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e74d708..791b00c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4082,6 +4082,24 @@ static void mem_cgroup_id_get(struct mem_cgroup *memcg)
 	atomic_inc(&memcg->id.ref);
 }
 
+static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+{
+	while (!atomic_inc_not_zero(&memcg->id.ref)) {
+		/*
+		 * The root cgroup cannot be destroyed, so it's refcount must
+		 * always be >= 1.
+		 */
+		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+			VM_BUG_ON(1);
+			break;
+		}
+		memcg = parent_mem_cgroup(memcg);
+		if (!memcg)
+			memcg = root_mem_cgroup;
+	}
+	return memcg;
+}
+
 static void mem_cgroup_id_put(struct mem_cgroup *memcg)
 {
 	if (atomic_dec_and_test(&memcg->id.ref)) {
@@ -5800,7 +5818,7 @@ subsys_initcall(mem_cgroup_init);
  */
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg, *swap_memcg;
 	unsigned short oldid;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5815,16 +5833,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	if (!memcg)
 		return;
 
-	mem_cgroup_id_get(memcg);
-	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+	/*
+	 * In case the memcg owning these pages has been offlined and doesn't
+	 * have an ID allocated to it anymore, charge the closest online
+	 * ancestor for the swap instead and transfer the memory+swap charge.
+	 */
+	swap_memcg = mem_cgroup_id_get_online(memcg);
+	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
 	VM_BUG_ON_PAGE(oldid, page);
-	mem_cgroup_swap_statistics(memcg, true);
+	mem_cgroup_swap_statistics(swap_memcg, true);
 
 	page->mem_cgroup = NULL;
 
 	if (!mem_cgroup_is_root(memcg))
 		page_counter_uncharge(&memcg->memory, 1);
 
+	if (memcg != swap_memcg) {
+		if (!mem_cgroup_is_root(swap_memcg))
+			page_counter_charge(&swap_memcg->memsw, 1);
+		page_counter_uncharge(&memcg->memsw, 1);
+	}
+
 	/*
 	 * Interrupts should be disabled here because the caller holds the
 	 * mapping->tree_lock lock which is taken with interrupts-off.  It is
@@ -5863,11 +5892,14 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 	if (!memcg)
 		return 0;
 
+	memcg = mem_cgroup_id_get_online(memcg);
+
 	if (!mem_cgroup_is_root(memcg) &&
-	    !page_counter_try_charge(&memcg->swap, 1, &counter))
+	    !page_counter_try_charge(&memcg->swap, 1, &counter)) {
+		mem_cgroup_id_put(memcg);
 		return -ENOMEM;
+	}
 
-	mem_cgroup_id_get(memcg);
 	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
 	VM_BUG_ON_PAGE(oldid, page);
 	mem_cgroup_swap_statistics(memcg, true);
--
cgit v0.10.2
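
The heart of this fix is the walk in mem_cgroup_id_get_online(): climb
toward the root until a cgroup whose id refcount is still nonzero
accepts the reference.  A self-contained, single-threaded model of
that walk — the names here (node, refs, get_not_zero) are invented for
illustration, not kernel API:

    #include <stdio.h>

    struct node {
        int refs;            /* 0 means the id has been released */
        struct node *parent; /* NULL only above the root */
    };

    /* single-threaded stand-in for atomic_inc_not_zero() */
    static int get_not_zero(struct node *n)
    {
        if (n->refs == 0)
            return 0;
        n->refs++;
        return 1;
    }

    /* climb until an ancestor still holding its id takes the ref */
    static struct node *get_online(struct node *n, struct node *root)
    {
        while (!get_not_zero(n))
            n = n->parent ? n->parent : root; /* root->refs always >= 1 */
        return n;
    }

    int main(void)
    {
        struct node root = { .refs = 1, .parent = NULL };
        struct node mid  = { .refs = 0, .parent = &root }; /* offline, id gone */
        struct node leaf = { .refs = 0, .parent = &mid };  /* offline, id gone */

        printf("charged to root: %d\n", get_online(&leaf, &root) == &root);
        return 0;
    }

The kernel version additionally warns if the walk ever reaches an
unreferenced root, since the root cgroup can never be destroyed.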
From 615d66c37c755c49ce022c9e5ac0875d27d2603d Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Thu, 11 Aug 2016 15:33:03 -0700
Subject: mm: memcontrol: fix memcg id ref counter on swap charge move

Since commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure
after many small jobs") swap entries do not pin memcg->css.refcnt
directly.  Instead, they pin memcg->id.ref.  So we should adjust the
reference counters accordingly when moving swap charges between
cgroups.

Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
Link: http://lkml.kernel.org/r/9ce297c64954a42dc90b543bc76106c4a94f07e8.1470219853.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov
Acked-by: Michal Hocko
Acked-by: Johannes Weiner
Cc: [3.19+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 791b00c..2ff0289 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4077,9 +4077,9 @@ static struct cftype mem_cgroup_legacy_files[] = {
 
 static DEFINE_IDR(mem_cgroup_idr);
 
-static void mem_cgroup_id_get(struct mem_cgroup *memcg)
+static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
 {
-	atomic_inc(&memcg->id.ref);
+	atomic_add(n, &memcg->id.ref);
 }
 
 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
@@ -4100,9 +4100,9 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 	return memcg;
 }
 
-static void mem_cgroup_id_put(struct mem_cgroup *memcg)
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 {
-	if (atomic_dec_and_test(&memcg->id.ref)) {
+	if (atomic_sub_and_test(n, &memcg->id.ref)) {
 		idr_remove(&mem_cgroup_idr, memcg->id.id);
 		memcg->id.id = 0;
 
@@ -4111,6 +4111,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg)
 	}
 }
 
+static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
+{
+	mem_cgroup_id_get_many(memcg, 1);
+}
+
+static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
+{
+	mem_cgroup_id_put_many(memcg, 1);
+}
+
 /**
  * mem_cgroup_from_id - look up a memcg from a memcg id
  * @id: the memcg id to look up
@@ -4745,6 +4755,8 @@ static void __mem_cgroup_clear_mc(void)
 		if (!mem_cgroup_is_root(mc.from))
 			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
 
+		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
+
 		/*
 		 * we charged both to->memory and to->memsw, so we
 		 * should uncharge to->memory.
@@ -4752,9 +4764,9 @@ static void __mem_cgroup_clear_mc(void)
 		if (!mem_cgroup_is_root(mc.to))
 			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
 
-		css_put_many(&mc.from->css, mc.moved_swap);
+		mem_cgroup_id_get_many(mc.to, mc.moved_swap);
+		css_put_many(&mc.to->css, mc.moved_swap);
 
-		/* we've already done css_get(mc.to) */
 		mc.moved_swap = 0;
 	}
 	memcg_oom_recover(from);
--
cgit v0.10.2
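
The new _many helpers simply batch n reference operations into a
single atomic add or subtract, which is what lets
__mem_cgroup_clear_mc() pin and release mc.moved_swap ids at once.  A
hypothetical user-space rendering, with C11 atomics standing in for
the kernel's atomic_t (atomic_fetch_sub() returns the prior value, so
comparing it against n plays the role of atomic_sub_and_test()):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int ref = 1; /* initial reference, as with a live memcg id */

    static void id_get_many(unsigned int n)
    {
        atomic_fetch_add(&ref, n);
    }

    static void id_put_many(unsigned int n)
    {
        /* previous value == n means the count just hit zero */
        if (atomic_fetch_sub(&ref, n) == (int)n)
            puts("last reference dropped: free the id here");
    }

    int main(void)
    {
        unsigned int moved_swap = 3; /* e.g. three moved swap entries */

        id_get_many(moved_swap); /* pin once per moved entry, in one op */
        id_put_many(moved_swap); /* balanced release, also in one op */
        id_put_many(1);          /* drop the initial reference */
        return 0;
    }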
From bcbf0d566b6e59a6e873bfe415cc415111a819e2 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko
Date: Thu, 11 Aug 2016 15:33:06 -0700
Subject: kasan: remove the unnecessary WARN_ONCE from quarantine.c

It's quite unlikely that the user will have so little memory that the
per-CPU quarantines won't fit into the given fraction of the available
memory.  Even in that case he won't be able to do anything with the
information given in the warning.

Link: http://lkml.kernel.org/r/1470929182-101413-1-git-send-email-glider@google.com
Signed-off-by: Alexander Potapenko
Acked-by: Andrey Ryabinin
Cc: Dmitry Vyukov
Cc: Andrey Konovalov
Cc: Christoph Lameter
Cc: Joonsoo Kim
Cc: Kuthonuzo Luruo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index b6728a3..baabaad 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -217,11 +217,8 @@ void quarantine_reduce(void)
 	new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
 		QUARANTINE_FRACTION;
 	percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
-	if (WARN_ONCE(new_quarantine_size < percpu_quarantines,
-		"Too little memory, disabling global KASAN quarantine.\n"))
-		new_quarantine_size = 0;
-	else
-		new_quarantine_size -= percpu_quarantines;
+	new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
+		0 : new_quarantine_size - percpu_quarantines;
 	WRITE_ONCE(quarantine_size, new_quarantine_size);
 
 	last = global_quarantine.head;
--
cgit v0.10.2

From f33e6f0671b3ba81acef4d7c078af86afcc855c4 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Thu, 11 Aug 2016 15:33:09 -0700
Subject: mm, oom: fix uninitialized ret in task_will_free_mem()

mm/oom_kill.c: In function `task_will_free_mem':
mm/oom_kill.c:767: warning: `ret' may be used uninitialized in this function

If __task_will_free_mem() is never called inside the for_each_process()
loop, ret will not be initialized.

Fixes: 1af8bb43269563e4 ("mm, oom: fortify task_will_free_mem()")
Link: http://lkml.kernel.org/r/1470255599-24841-1-git-send-email-geert@linux-m68k.org
Signed-off-by: Geert Uytterhoeven
Acked-by: Tetsuo Handa
Acked-by: Michal Hocko
Cc: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7d0a275..d53a9aa 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -764,7 +764,7 @@ bool task_will_free_mem(struct task_struct *task)
 {
 	struct mm_struct *mm = task->mm;
 	struct task_struct *p;
-	bool ret;
+	bool ret = true;
 
 	/*
 	 * Skip tasks without mm because it might have passed its exit_mm and
--
cgit v0.10.2
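
Geert's fix reduces to a general rule: if a loop may run zero times,
an accumulator assigned only inside its body needs a starting value.
A hypothetical minimal reproduction (names invented for this note;
build with -Wmaybe-uninitialized, with and without the initializer, to
see the same warning):

    #include <stdbool.h>
    #include <stdio.h>

    static bool check(int v) { return v > 0; }

    static bool all_positive(const int *vals, int n)
    {
        bool ret = true; /* without "= true", n == 0 returns garbage */
        int i;

        for (i = 0; i < n; i++) {
            ret = check(vals[i]);
            if (!ret)
                break;
        }
        return ret;
    }

    int main(void)
    {
        printf("%d\n", all_positive(NULL, 0)); /* well-defined: 1 */
        return 0;
    }

Initializing ret to true also matches the intent of the patched
function: if no other process sharing the mm vetoes it, the task is
still treated as about to free its memory.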
From 5830169f47269f78f6624bd70165eb571270da82 Mon Sep 17 00:00:00 2001
From: Reza Arbab
Date: Thu, 11 Aug 2016 15:33:12 -0700
Subject: mm/memory_hotplug.c: initialize per_cpu_nodestats for hotadded pgdats

The following oops occurs after a pgdat is hotadded:

  Unable to handle kernel paging request for data at address 0x00c30001
  Faulting instruction address: 0xc00000000022f8f4
  Oops: Kernel access of bad area, sig: 11 [#1]
  SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in: ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter nls_utf8 isofs sg virtio_balloon uio_pdrv_genirq uio ip_tables xfs libcrc32c sr_mod cdrom sd_mod virtio_net ibmvscsi scsi_transport_srp virtio_pci virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod
  CPU: 0 PID: 0 Comm: swapper/0 Tainted: G        W 4.8.0-rc1-device #110
  task: c000000000ef3080 task.stack: c000000000f6c000
  NIP: c00000000022f8f4 LR: c00000000022f948 CTR: 0000000000000000
  REGS: c000000000f6fa50 TRAP: 0300   Tainted: G        W (4.8.0-rc1-device)
  MSR: 800000010280b033 CR: 84002028 XER: 20000000
  CFAR: d000000001d2013c DAR: 0000000000c30001 DSISR: 40000000 SOFTE: 0
  NIP refresh_cpu_vm_stats+0x1a4/0x2f0
  LR refresh_cpu_vm_stats+0x1f8/0x2f0
  Call Trace:
    refresh_cpu_vm_stats+0x1f8/0x2f0 (unreliable)

Add per_cpu_nodestats initialization to the hotplug codepath.

Link: http://lkml.kernel.org/r/1470931473-7090-1-git-send-email-arbab@linux.vnet.ibm.com
Signed-off-by: Reza Arbab
Cc: Mel Gorman
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3894b65..41266dc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1219,6 +1219,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
 
 	/*
 	 * The node we allocated has no zone fallback lists. For avoiding
@@ -1249,6 +1250,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 {
 	arch_refresh_nodedata(nid, NULL);
+	free_percpu(pgdat->per_cpu_nodestats);
 	arch_free_nodedata(pgdat);
 	return;
 }
--
cgit v0.10.2
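
The fix pairs an allocation in the hot-add path with a release in the
rollback path; the oops happened because refresh_cpu_vm_stats()
dereferenced the never-allocated pgdat->per_cpu_nodestats.  A
hypothetical user-space model of that shape — malloc()/free() standing
in for alloc_percpu()/free_percpu(), struct names invented for this
note:

    #include <stdio.h>
    #include <stdlib.h>

    struct nodestats { long vm_stat[4]; };

    struct pgdat_model {
        struct nodestats *per_cpu_nodestats;
    };

    static int hotadd_node(struct pgdat_model *pgdat)
    {
        /* the bug: leaving this NULL oopsed on the first stats refresh */
        pgdat->per_cpu_nodestats = calloc(1, sizeof(*pgdat->per_cpu_nodestats));
        return pgdat->per_cpu_nodestats ? 0 : -1;
    }

    static void rollback_node_hotadd(struct pgdat_model *pgdat)
    {
        /* matching release if the hot-add has to be unwound */
        free(pgdat->per_cpu_nodestats);
        pgdat->per_cpu_nodestats = NULL;
    }

    int main(void)
    {
        struct pgdat_model pgdat = { 0 };

        if (hotadd_node(&pgdat) == 0) {
            puts("per-node stats allocated");
            rollback_node_hotadd(&pgdat);
        }
        return 0;
    }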