Diffstat (limited to 'mm')
 mm/Kconfig              |   64
 mm/Makefile             |   15
 mm/backing-dev.c        |   46
 mm/balloon_compaction.c |  125
 mm/bootmem.c            |   13
 mm/cma.c                |  418
 mm/compaction.c         |  691
 mm/debug.c              |  237
 mm/dmapool.c            |   58
 mm/filemap.c            |   79
 mm/gup.c                |  376
 mm/highmem.c            |   86
 mm/huge_memory.c        |  137
 mm/hugetlb.c            |  143
 mm/hugetlb_cgroup.c     |    3
 mm/hwpoison-inject.c    |    3
 mm/internal.h           |   53
 mm/interval_tree.c      |    2
 mm/iov_iter.c           |  257
 mm/kmemcheck.c          |    1
 mm/ksm.c                |    4
 mm/madvise.c            |    3
 mm/memblock.c           |    7
 mm/memcontrol.c         | 2022
 mm/memory-failure.c     |   12
 mm/memory.c             |  166
 mm/memory_hotplug.c     |   78
 mm/mempolicy.c          |  134
 mm/migrate.c            |   59
 mm/mlock.c              |   17
 mm/mmap.c               |  172
 mm/mmu_notifier.c       |   45
 mm/mprotect.c           |   20
 mm/mremap.c             |    5
 mm/nobootmem.c          |   10
 mm/nommu.c              |    7
 mm/oom_kill.c           |   57
 mm/page-writeback.c     |   58
 mm/page_alloc.c         |  570
 mm/page_cgroup.c        |    1
 mm/page_isolation.c     |   43
 mm/pagewalk.c           |    2
 mm/percpu-km.c          |   16
 mm/percpu-vm.c          |  184
 mm/percpu.c             |  524
 mm/pgtable-generic.c    |    2
 mm/readahead.c          |    3
 mm/rmap.c               |  122
 mm/shmem.c              |  482
 mm/slab.c               |  864
 mm/slab.h               |   77
 mm/slab_common.c        |  293
 mm/slob.c               |    2
 mm/slub.c               |  347
 mm/swap.c               |   84
 mm/swap_state.c         |   25
 mm/swapfile.c           |   21
 mm/truncate.c           |   70
 mm/util.c               |  134
 mm/vmalloc.c            |   50
 mm/vmscan.c             |  394
 mm/vmstat.c             |  162
 mm/zbud.c               |  112
 mm/zpool.c              |  364
 mm/zsmalloc.c           |  131
 mm/zswap.c              |   81
66 files changed, 6629 insertions(+), 4214 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
 config HAVE_MEMBLOCK_PHYS_MAP
 	boolean
 
+config HAVE_GENERIC_RCU_GUP
+	boolean
+
 config ARCH_DISCARD_MEMBLOCK
 	boolean
@@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
 	boolean
 
 #
+# support for memory balloon
+config MEMORY_BALLOON
+	boolean
+
+#
 # support for memory balloon compaction
 config BALLOON_COMPACTION
 	bool "Allow for balloon memory compaction/migration"
 	def_bool y
-	depends on COMPACTION && VIRTIO_BALLOON
+	depends on COMPACTION && MEMORY_BALLOON
 	help
 	  Memory fragmentation introduced by ballooning might reduce
 	  significantly the number of 2MB contiguous memory blocks that can be
@@ -508,21 +516,34 @@ config CMA_DEBUG
 	  processing calls such as dma_alloc_from_contiguous().
 	  This option does not affect warning and error messages.
 
-config ZBUD
-	tristate
-	default n
+config CMA_AREAS
+	int "Maximum count of the CMA areas"
+	depends on CMA
+	default 7
 	help
-	  A special purpose allocator for storing compressed pages.
-	  It is designed to store up to two compressed pages per physical
-	  page. While this design limits storage density, it has simple and
-	  deterministic reclaim properties that make it preferable to a higher
-	  density approach when reclaim will be used.
+	  CMA allows to create CMA areas for particular purpose, mainly,
+	  used as device private area. This parameter sets the maximum
+	  number of CMA area in the system.
+
+	  If unsure, leave the default value "7".
+
+config MEM_SOFT_DIRTY
+	bool "Track memory changes"
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
+	select PROC_PAGE_MONITOR
+	help
+	  This option enables memory changes tracking by introducing a
+	  soft-dirty bit on pte-s. This bit it set when someone writes
+	  into a page just as regular dirty bit, but unlike the latter
+	  it can be cleared by hands.
+
+	  See Documentation/vm/soft-dirty.txt for more details.
 
 config ZSWAP
 	bool "Compressed cache for swap pages (EXPERIMENTAL)"
 	depends on FRONTSWAP && CRYPTO=y
 	select CRYPTO_LZO
-	select ZBUD
+	select ZPOOL
 	default n
 	help
 	  A lightweight compressed cache for swap pages.  It takes
@@ -538,17 +559,22 @@ config ZSWAP
 	  they have not be fully explored on the large set of potential
 	  configurations and workloads that exist.
 
-config MEM_SOFT_DIRTY
-	bool "Track memory changes"
-	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
-	select PROC_PAGE_MONITOR
+config ZPOOL
+	tristate "Common API for compressed memory storage"
+	default n
 	help
-	  This option enables memory changes tracking by introducing a
-	  soft-dirty bit on pte-s. This bit it set when someone writes
-	  into a page just as regular dirty bit, but unlike the latter
-	  it can be cleared by hands.
+	  Compressed memory storage API.  This allows using either zbud or
+	  zsmalloc.
 
-	  See Documentation/vm/soft-dirty.txt for more details.
+config ZBUD
+	tristate "Low density storage for compressed pages"
+	default n
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It is designed to store up to two compressed pages per physical
+	  page.  While this design limits storage density, it has simple and
+	  deterministic reclaim properties that make it preferable to a higher
+	  density approach when reclaim will be used.
 
 config ZSMALLOC
 	tristate "Memory allocator for compressed pages"
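[Note: the new ZPOOL entry above puts zbud and zsmalloc behind one allocation API, which the ZSWAP change then selects. A minimal sketch of a zpool client, assuming the zpool interface as merged in this cycle; the "zbud" type string, the NULL ops argument, and the error handling are illustrative, not taken from this diff:]

    #include <linux/gfp.h>
    #include <linux/string.h>
    #include <linux/zpool.h>

    static int zpool_smoke_test(void)
    {
        unsigned long handle;
        struct zpool *pool;
        char *buf;

        /* Pick a backing allocator by name: "zbud" or "zsmalloc". */
        pool = zpool_create_pool("zbud", GFP_KERNEL, NULL);
        if (!pool)
            return -ENOMEM;

        /* Allocate from the backend; an opaque handle comes back. */
        if (zpool_malloc(pool, 64, GFP_KERNEL, &handle)) {
            zpool_destroy_pool(pool);
            return -ENOMEM;
        }

        /* Handles must be mapped before the memory can be touched. */
        buf = zpool_map_handle(pool, handle, ZPOOL_MM_RW);
        memset(buf, 0, 64);
        zpool_unmap_handle(pool, handle);

        zpool_free(pool, handle);
        zpool_destroy_pool(pool);
        return 0;
    }
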
diff --git a/mm/Makefile b/mm/Makefile
index 4064f3e..8405eb0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
 #
 
 mmu-y			:= nommu.o
-mmu-$(CONFIG_MMU)	:= fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
+mmu-$(CONFIG_MMU)	:= fremap.o gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   vmalloc.o pagewalk.o pgtable-generic.o
@@ -11,14 +11,14 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
 mmu-$(CONFIG_MMU)	+= process_vm_access.o
 endif
 
-obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y			:= filemap.o mempool.o oom_kill.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o balloon_compaction.o vmacache.o \
+			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   iov_iter.o $(mmu-y)
+			   iov_iter.o debug.o $(mmu-y)
 
 obj-y += init-mm.o
@@ -28,6 +28,10 @@ else
 	obj-y		+= bootmem.o
 endif
 
+obj-$(CONFIG_ADVISE_SYSCALLS)	+= fadvise.o
+ifdef CONFIG_MMU
+	obj-$(CONFIG_ADVISE_SYSCALLS)	+= madvise.o
+endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
@@ -59,6 +63,9 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZPOOL)	+= zpool.o
 obj-$(CONFIG_ZBUD)	+= zbud.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
+obj-$(CONFIG_CMA)	+= cma.o
+obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1706cbb..0ae0df5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -40,7 +40,7 @@ LIST_HEAD(bdi_list);
 /* bdi_wq serves all asynchronous writeback tasks */
 struct workqueue_struct *bdi_wq;
 
-void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
 	if (wb1 < wb2) {
 		spin_lock(&wb1->list_lock);
@@ -376,13 +376,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 	flush_delayed_work(&bdi->wb.dwork);
 	WARN_ON(!list_empty(&bdi->work_list));
-
-	/*
-	 * This shouldn't be necessary unless @bdi for some reason has
-	 * unflushed dirty IO after work_list is drained. Do it anyway
-	 * just in case.
-	 */
-	cancel_delayed_work_sync(&bdi->wb.dwork);
+	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 }
 
 /*
@@ -402,21 +396,15 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
-	struct device *dev = bdi->dev;
-
-	if (dev) {
+	if (bdi->dev) {
 		bdi_set_min_ratio(bdi, 0);
 		trace_writeback_bdi_unregister(bdi);
 		bdi_prune_sb(bdi);
 		bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
-
-		spin_lock_bh(&bdi->wb_lock);
+		device_unregister(bdi->dev);
 		bdi->dev = NULL;
-		spin_unlock_bh(&bdi->wb_lock);
-
-		device_unregister(dev);
 	}
 }
 EXPORT_SYMBOL(bdi_unregister);
@@ -455,7 +443,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi_wb_init(&bdi->wb, bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
 		if (err)
 			goto err;
 	}
@@ -470,7 +458,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
-	err = fprop_local_init_percpu(&bdi->completions);
+	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
 
 	if (err) {
 err:
@@ -487,8 +475,17 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	int i;
 
 	/*
-	 * Splice our entries to the default_backing_dev_info, if this
-	 * bdi disappears
+	 * Splice our entries to the default_backing_dev_info.  This
+	 * condition shouldn't happen.  @wb must be empty at this point and
+	 * dirty inodes on it might cause other issues.  This workaround is
+	 * added by ce5f8e779519 ("writeback: splice dirty inode entries to
+	 * default bdi on bdi_destroy()") without root-causing the issue.
+	 *
+	 * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
+	 * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
+	 *
+	 * We should probably add WARN_ON() to find out whether it still
+	 * happens and track it down if so.
 	 */
 	if (bdi_has_dirty_io(bdi)) {
 		struct bdi_writeback *dst = &default_backing_dev_info.wb;
@@ -503,12 +500,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
 	bdi_unregister(bdi);
 
-	/*
-	 * If bdi_unregister() had already been called earlier, the dwork
-	 * could still be pending because bdi_prune_sb() can race with the
-	 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
-	 */
-	cancel_delayed_work_sync(&bdi->wb.dwork);
+	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
@@ -631,7 +623,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 	 * of sleeping on the congestion queue
 	 */
 	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
-	    !zone_is_reclaim_congested(zone)) {
+	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
 		cond_resched();
 
 		/* In case we scheduled, work out time remaining */
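[Note: the percpu_counter_init() and fprop_local_init_percpu() hunks above reflect an API change in which the allocation mask becomes an explicit argument at init time. A minimal sketch of the updated calling convention; the counter name and the function around it are hypothetical:]

    #include <linux/gfp.h>
    #include <linux/percpu_counter.h>

    static struct percpu_counter events;

    static int counter_example(void)
    {
        /* The GFP mask for the per-CPU allocation is now passed in. */
        int err = percpu_counter_init(&events, 0, GFP_KERNEL);

        if (err)
            return err;

        percpu_counter_inc(&events);
        pr_info("events: %lld\n", percpu_counter_sum(&events));

        percpu_counter_destroy(&events);
        return 0;
    }
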
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a50..fcad832 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -11,32 +11,6 @@
 #include <linux/balloon_compaction.h>
 
 /*
- * balloon_devinfo_alloc - allocates a balloon device information descriptor.
- * @balloon_dev_descriptor: pointer to reference the balloon device which
- *			    this struct balloon_dev_info will be servicing.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct balloon_dev_info which will be used to reference a balloon device
- * as well as to keep track of the balloon device page list.
- */
-struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
-{
-	struct balloon_dev_info *b_dev_info;
-	b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
-	if (!b_dev_info)
-		return ERR_PTR(-ENOMEM);
-
-	b_dev_info->balloon_device = balloon_dev_descriptor;
-	b_dev_info->mapping = NULL;
-	b_dev_info->isolated_pages = 0;
-	spin_lock_init(&b_dev_info->pages_lock);
-	INIT_LIST_HEAD(&b_dev_info->pages);
-
-	return b_dev_info;
-}
-EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
-
-/*
  * balloon_page_enqueue - allocates a new page and inserts it into the balloon
  *			  page list.
  * @b_dev_info: balloon device decriptor where we will insert a new page to
@@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
 	 */
 	BUG_ON(!trylock_page(page));
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+	balloon_page_insert(b_dev_info, page);
+	__count_vm_event(BALLOON_INFLATE);
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 	unlock_page(page);
 	return page;
@@ -93,18 +68,16 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 		 * to be released by the balloon driver.
 		 */
 		if (trylock_page(page)) {
+#ifdef CONFIG_BALLOON_COMPACTION
+			if (!PagePrivate(page)) {
+				/* raced with isolation */
+				unlock_page(page);
+				continue;
+			}
+#endif
 			spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-			/*
-			 * Raise the page refcount here to prevent any wrong
-			 * attempt to isolate this page, in case of coliding
-			 * with balloon_page_isolate() just after we release
-			 * the page lock.
-			 *
-			 * balloon_page_free() will take care of dropping
-			 * this extra refcount later.
-			 */
-			get_page(page);
 			balloon_page_delete(page);
+			__count_vm_event(BALLOON_DEFLATE);
 			spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 			unlock_page(page);
 			dequeued_page = true;
@@ -132,62 +105,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 EXPORT_SYMBOL_GPL(balloon_page_dequeue);
 
 #ifdef CONFIG_BALLOON_COMPACTION
-/*
- * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
- * @b_dev_info: holds the balloon device information descriptor.
- * @a_ops: balloon_mapping address_space_operations descriptor.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct address_space which will be used as the special page->mapping for
- * balloon device enlisted page instances.
- */
-struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
-		const struct address_space_operations *a_ops)
-{
-	struct address_space *mapping;
-
-	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
-	if (!mapping)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Give a clean 'zeroed' status to all elements of this special
-	 * balloon page->mapping struct address_space instance.
-	 */
-	address_space_init_once(mapping);
-
-	/*
-	 * Set mapping->flags appropriately, to allow balloon pages
-	 * ->mapping identification.
-	 */
-	mapping_set_balloon(mapping);
-	mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
-
-	/* balloon's page->mapping->a_ops callback descriptor */
-	mapping->a_ops = a_ops;
-
-	/*
-	 * Establish a pointer reference back to the balloon device descriptor
-	 * this particular page->mapping will be servicing.
-	 * This is used by compaction / migration procedures to identify and
-	 * access the balloon device pageset while isolating / migrating pages.
-	 *
-	 * As some balloon drivers can register multiple balloon devices
-	 * for a single guest, this also helps compaction / migration to
-	 * properly deal with multiple balloon pagesets, when required.
-	 */
-	mapping->private_data = b_dev_info;
-	b_dev_info->mapping = mapping;
-
-	return mapping;
-}
-EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
 
 static inline void __isolate_balloon_page(struct page *page)
 {
-	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	ClearPagePrivate(page);
 	list_del(&page->lru);
 	b_dev_info->isolated_pages++;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
@@ -195,20 +120,16 @@ static inline void __isolate_balloon_page(struct page *page)
 
 static inline void __putback_balloon_page(struct page *page)
 {
-	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	SetPagePrivate(page);
 	list_add(&page->lru, &b_dev_info->pages);
 	b_dev_info->isolated_pages--;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 }
 
-static inline int __migrate_balloon_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode)
-{
-	return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
-}
-
 /* __isolate_lru_page() counterpart for a ballooned page */
 bool balloon_page_isolate(struct page *page)
 {
@@ -235,12 +156,11 @@ bool balloon_page_isolate(struct page *page)
 	 */
 	if (likely(trylock_page(page))) {
 		/*
-		 * A ballooned page, by default, has just one refcount.
+		 * A ballooned page, by default, has PagePrivate set.
 		 * Prevent concurrent compaction threads from isolating
-		 * an already isolated balloon page by refcount check.
+		 * an already isolated balloon page by clearing it.
		 */
-		if (__is_movable_balloon_page(page) &&
-		    page_count(page) == 2) {
+		if (balloon_page_movable(page)) {
 			__isolate_balloon_page(page);
 			unlock_page(page);
 			return true;
@@ -276,7 +196,7 @@ void balloon_page_putback(struct page *page)
 int balloon_page_migrate(struct page *newpage,
 			 struct page *page, enum migrate_mode mode)
 {
-	struct address_space *mapping;
+	struct balloon_dev_info *balloon = balloon_page_device(page);
 	int rc = -EAGAIN;
 
 	/*
@@ -292,9 +212,8 @@ int balloon_page_migrate(struct page *newpage,
 		return rc;
 	}
 
-	mapping = page->mapping;
-	if (mapping)
-		rc = __migrate_balloon_page(mapping, newpage, page, mode);
+	if (balloon && balloon->migratepage)
+		rc = balloon->migratepage(balloon, newpage, page, mode);
 
 	unlock_page(newpage);
 	return rc;
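[Note: with balloon_devinfo_alloc() and balloon_mapping_alloc() removed, a balloon driver now embeds struct balloon_dev_info and wires up the migratepage callback on it directly, as the new balloon_page_migrate() above expects. A rough sketch of the new registration pattern; the driver names are hypothetical, balloon_devinfo_init() is assumed to be the header-side initializer paired with these changes, and the callback body is only a bookkeeping stub, not a full driver:]

    #include <linux/balloon_compaction.h>

    struct my_balloon {
        struct balloon_dev_info b_dev_info;    /* embedded, not kmalloc'ed */
    };

    static struct my_balloon mb;

    static int my_balloon_migratepage(struct balloon_dev_info *info,
                                      struct page *newpage, struct page *page,
                                      enum migrate_mode mode)
    {
        unsigned long flags;

        /*
         * A real driver would tell its host to back @newpage and to
         * forget @page here; this stub only moves the bookkeeping.
         */
        spin_lock_irqsave(&info->pages_lock, flags);
        balloon_page_insert(info, newpage);
        info->isolated_pages--;
        spin_unlock_irqrestore(&info->pages_lock, flags);

        balloon_page_delete(page);
        return MIGRATEPAGE_SUCCESS;
    }

    static int my_balloon_init(void)
    {
        balloon_devinfo_init(&mb.b_dev_info);
        mb.b_dev_info.migratepage = my_balloon_migratepage;

        /* Inflate/deflate keep using balloon_page_enqueue()/_dequeue(). */
        return 0;
    }
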
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd350..477be69 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -16,9 +16,9 @@
 #include <linux/kmemleak.h>
 #include <linux/range.h>
 #include <linux/memblock.h>
+#include <linux/bug.h>
+#include <linux/io.h>
 
-#include <asm/bug.h>
-#include <asm/io.h>
 #include <asm/processor.h>
 
 #include "internal.h"
@@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 
 static int reset_managed_pages_done __initdata;
 
-static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+void reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	if (reset_managed_pages_done)
-		return;
-
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
 		z->managed_pages = 0;
 }
@@ -258,8 +255,12 @@
 void __init reset_all_zones_managed_pages(void)
 {
 	struct pglist_data *pgdat;
 
+	if (reset_managed_pages_done)
+		return;
+
 	for_each_online_pgdat(pgdat)
 		reset_node_managed_pages(pgdat);
+
 	reset_managed_pages_done = 1;
 }
diff --git a/mm/cma.c b/mm/cma.c
new file mode 100644
index 0000000..fde706e
--- /dev/null
+++ b/mm/cma.c
@@ -0,0 +1,418 @@
+/*
+ * Contiguous Memory Allocator
+ *
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Copyright IBM Corporation, 2013
+ * Copyright LG Electronics Inc., 2014
+ * Written by:
+ *	Marek Szyprowski <m.szyprowski@samsung.com>
+ *	Michal Nazarewicz <mina86@mina86.com>
+ *	Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *	Joonsoo Kim <iamjoonsoo.kim@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+#  define DEBUG
+#endif
+#endif
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/cma.h>
+#include <linux/highmem.h>
+
+struct cma {
+	unsigned long	base_pfn;
+	unsigned long	count;
+	unsigned long	*bitmap;
+	unsigned int order_per_bit; /* Order of pages represented by one bit */
+	struct mutex	lock;
+};
+
+static struct cma cma_areas[MAX_CMA_AREAS];
+static unsigned cma_area_count;
+static DEFINE_MUTEX(cma_mutex);
+
+phys_addr_t cma_get_base(struct cma *cma)
+{
+	return PFN_PHYS(cma->base_pfn);
+}
+
+unsigned long cma_get_size(struct cma *cma)
+{
+	return cma->count << PAGE_SHIFT;
+}
+
+static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
+{
+	if (align_order <= cma->order_per_bit)
+		return 0;
+	return (1UL << (align_order - cma->order_per_bit)) - 1;
+}
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+	return cma->count >> cma->order_per_bit;
+}
+
+static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
+						unsigned long pages)
+{
+	return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
+}
+
+static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
+{
+	unsigned long bitmap_no, bitmap_count;
+
+	bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
+	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+
+	mutex_lock(&cma->lock);
+	bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
+	mutex_unlock(&cma->lock);
+}
+
+static int __init cma_activate_area(struct cma *cma)
+{
+	int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
+	unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
+	unsigned i = cma->count >> pageblock_order;
+	struct zone *zone;
+
+	cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+
+	if (!cma->bitmap)
+		return -ENOMEM;
+
+	WARN_ON_ONCE(!pfn_valid(pfn));
+	zone = page_zone(pfn_to_page(pfn));
+
+	do {
+		unsigned j;
+
+		base_pfn = pfn;
+		for (j = pageblock_nr_pages; j; --j, pfn++) {
+			WARN_ON_ONCE(!pfn_valid(pfn));
+			/*
+			 * alloc_contig_range requires the pfn range
+			 * specified to be in the same zone. Make this
+			 * simple by forcing the entire CMA resv range
+			 * to be in the same zone.
+			 */
+			if (page_zone(pfn_to_page(pfn)) != zone)
+				goto err;
+		}
+		init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+	} while (--i);
+
+	mutex_init(&cma->lock);
+	return 0;
+
+err:
+	kfree(cma->bitmap);
+	cma->count = 0;
+	return -EINVAL;
+}
+
+static int __init cma_init_reserved_areas(void)
+{
+	int i;
+
+	for (i = 0; i < cma_area_count; i++) {
+		int ret = cma_activate_area(&cma_areas[i]);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * cma_init_reserved_mem() - create custom contiguous area from reserved memory
+ * @base: Base address of the reserved area
+ * @size: Size of the reserved area (in bytes),
+ * @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @res_cma: Pointer to store the created cma region.
+ *
+ * This function creates custom contiguous area from already reserved memory.
+ */
+int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
+				 int order_per_bit, struct cma **res_cma)
+{
+	struct cma *cma;
+	phys_addr_t alignment;
+
+	/* Sanity checks */
+	if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+		pr_err("Not enough slots for CMA reserved regions!\n");
+		return -ENOSPC;
+	}
+
+	if (!size || !memblock_is_region_reserved(base, size))
+		return -EINVAL;
+
+	/* ensure minimal alignment requied by mm core */
+	alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+
+	/* alignment should be aligned with order_per_bit */
+	if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
+		return -EINVAL;
+
+	if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size)
+		return -EINVAL;
+
+	/*
+	 * Each reserved area must be initialised later, when more kernel
+	 * subsystems (like slab allocator) are available.
+	 */
+	cma = &cma_areas[cma_area_count];
+	cma->base_pfn = PFN_DOWN(base);
+	cma->count = size >> PAGE_SHIFT;
+	cma->order_per_bit = order_per_bit;
+	*res_cma = cma;
+	cma_area_count++;
+
+	return 0;
+}
+
+/**
+ * cma_declare_contiguous() - reserve custom contiguous area
+ * @base: Base address of the reserved area optional, use 0 for any
+ * @size: Size of the reserved area (in bytes),
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ * @alignment: Alignment for the CMA area, should be power of 2 or zero
+ * @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @fixed: hint about where to place the reserved area
+ * @res_cma: Pointer to store the created cma region.
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory. This function allows to create custom reserved areas.
+ *
+ * If @fixed is true, reserve contiguous area at exactly @base.  If false,
+ * reserve in range from @base to @limit.
+ */
+int __init cma_declare_contiguous(phys_addr_t base,
+			phys_addr_t size, phys_addr_t limit,
+			phys_addr_t alignment, unsigned int order_per_bit,
+			bool fixed, struct cma **res_cma)
+{
+	phys_addr_t memblock_end = memblock_end_of_DRAM();
+	phys_addr_t highmem_start = __pa(high_memory);
+	int ret = 0;
+
+	pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
+		__func__, &size, &base, &limit, &alignment);
+
+	if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+		pr_err("Not enough slots for CMA reserved regions!\n");
+		return -ENOSPC;
+	}
+
+	if (!size)
+		return -EINVAL;
+
+	if (alignment && !is_power_of_2(alignment))
+		return -EINVAL;
+
+	/*
+	 * Sanitise input arguments.
+	 * Pages both ends in CMA area could be merged into adjacent unmovable
+	 * migratetype page by page allocator's buddy algorithm. In the case,
+	 * you couldn't get a contiguous memory, which is not what we want.
+	 */
+	alignment = max(alignment,
+		(phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+	base = ALIGN(base, alignment);
+	size = ALIGN(size, alignment);
+	limit &= ~(alignment - 1);
+
+	if (!base)
+		fixed = false;
+
+	/* size should be aligned with order_per_bit */
+	if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
+		return -EINVAL;
+
+	/*
+	 * If allocating at a fixed base the request region must not cross the
+	 * low/high memory boundary.
+	 */
+	if (fixed && base < highmem_start && base + size > highmem_start) {
+		ret = -EINVAL;
+		pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+			&base, &highmem_start);
+		goto err;
+	}
+
+	/*
+	 * If the limit is unspecified or above the memblock end, its effective
+	 * value will be the memblock end. Set it explicitly to simplify further
+	 * checks.
+	 */
+	if (limit == 0 || limit > memblock_end)
+		limit = memblock_end;
+
+	/* Reserve memory */
+	if (fixed) {
+		if (memblock_is_region_reserved(base, size) ||
+		    memblock_reserve(base, size) < 0) {
+			ret = -EBUSY;
+			goto err;
+		}
+	} else {
+		phys_addr_t addr = 0;
+
+		/*
+		 * All pages in the reserved area must come from the same zone.
+		 * If the requested region crosses the low/high memory boundary,
+		 * try allocating from high memory first and fall back to low
+		 * memory in case of failure.
+		 */
+		if (base < highmem_start && limit > highmem_start) {
+			addr = memblock_alloc_range(size, alignment,
+						    highmem_start, limit);
+			limit = highmem_start;
+		}
+
+		if (!addr) {
+			addr = memblock_alloc_range(size, alignment, base,
+						    limit);
+			if (!addr) {
+				ret = -ENOMEM;
+				goto err;
+			}
+		}
+
+		base = addr;
+	}
+
+	ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma);
+	if (ret)
+		goto err;
+
+	pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
+		&base);
+	return 0;
+
+err:
+	pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
+	return ret;
+}
+
+/**
+ * cma_alloc() - allocate pages from contiguous area
+ * @cma:   Contiguous memory region for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ *
+ * This function allocates part of contiguous memory on specific
+ * contiguous memory area.
+ */
+struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
+{
+	unsigned long mask, pfn, start = 0;
+	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+	struct page *page = NULL;
+	int ret;
+
+	if (!cma || !cma->count)
+		return NULL;
+
+	pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+		 count, align);
+
+	if (!count)
+		return NULL;
+
+	mask = cma_bitmap_aligned_mask(cma, align);
+	bitmap_maxno = cma_bitmap_maxno(cma);
+	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+
+	for (;;) {
+		mutex_lock(&cma->lock);
+		bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
+				bitmap_maxno, start, bitmap_count, mask);
+		if (bitmap_no >= bitmap_maxno) {
+			mutex_unlock(&cma->lock);
+			break;
+		}
+		bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
+		/*
+		 * It's safe to drop the lock here. We've marked this region for
+		 * our exclusive use. If the migration fails we will take the
+		 * lock again and unmark it.
+		 */
+		mutex_unlock(&cma->lock);
+
+		pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
+		mutex_lock(&cma_mutex);
+		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+		mutex_unlock(&cma_mutex);
+		if (ret == 0) {
+			page = pfn_to_page(pfn);
+			break;
+		}
+
+		cma_clear_bitmap(cma, pfn, count);
+		if (ret != -EBUSY)
+			break;
+
+		pr_debug("%s(): memory range at %p is busy, retrying\n",
+			 __func__, pfn_to_page(pfn));
+		/* try again with a bit different memory target */
+		start = bitmap_no + mask + 1;
+	}
+
+	pr_debug("%s(): returned %p\n", __func__, page);
+	return page;
+}
+
+/**
+ * cma_release() - release allocated pages
+ * @cma:   Contiguous memory region for which the allocation is performed.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by alloc_cma().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool cma_release(struct cma *cma, struct page *pages, int count)
+{
+	unsigned long pfn;
+
+	if (!cma || !pages)
+		return false;
+
+	pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+	pfn = page_to_pfn(pages);
+
+	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+		return false;
+
+	VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+	free_contig_range(pfn, count);
+	cma_clear_bitmap(cma, pfn, count);
+
+	return true;
+}
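[Note: callers of the new mm/cma.c are expected to reserve an area early and allocate from it at runtime. A condensed usage sketch based on the signatures defined above; the function and variable names local to this sketch are hypothetical:]

    #include <linux/cma.h>
    #include <linux/printk.h>
    #include <linux/sizes.h>

    static struct cma *my_cma;

    /* From early arch setup, once memblock works but before the buddy is up. */
    static void __init my_reserve(void)
    {
        /* 16 MiB, no fixed base, no limit, one bitmap bit per page. */
        if (cma_declare_contiguous(0, SZ_16M, 0, 0, 0, false, &my_cma))
            pr_warn("my_cma: reservation failed\n");
    }

    /* From driver code, after cma_init_reserved_areas() has run. */
    static struct page *my_alloc_buf(void)
    {
        /* 64 contiguous pages, aligned to an order-4 boundary. */
        return cma_alloc(my_cma, 64, 4);
    }

    static void my_free_buf(struct page *pages)
    {
        if (!cma_release(my_cma, pages, 64))
            pr_warn("my_cma: pages not from this area\n");
    }
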
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b..f9792ba 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone)
+{
+	struct page *start_page;
+	struct page *end_page;
+
+	/* end_pfn is one past the range we are checking */
+	end_pfn--;
+
+	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+		return NULL;
+
+	start_page = pfn_to_page(start_pfn);
+
+	if (page_zone(start_page) != zone)
+		return NULL;
+
+	end_page = pfn_to_page(end_pfn);
+
+	/* This gives a shorter code than deriving page_zone(end_page) */
+	if (page_zone_id(start_page) != page_zone_id(end_page))
+		return NULL;
+
+	return start_page;
+}
+
 #ifdef CONFIG_COMPACTION
 /* Returns true if the pageblock should be scanned for pages to isolate. */
 static inline bool isolation_suitable(struct compact_control *cc,
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool set_unsuitable, bool migrate_scanner)
+			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
 	unsigned long pfn;
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (nr_isolated)
 		return;
 
-	/*
-	 * Only skip pageblocks when all forms of compaction will be known to
-	 * fail in the near future.
-	 */
-	if (set_unsuitable)
-		set_pageblock_skip(page);
+	set_pageblock_skip(page);
 
 	pfn = page_to_pfn(page);
 
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc,
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool set_unsuitable, bool migrate_scanner)
+			bool migrate_scanner)
 {
 }
 #endif /* CONFIG_COMPACTION */
 
-static inline bool should_release_lock(spinlock_t *lock)
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. For async compaction, back out if the lock cannot
+ * be taken immediately. For sync compaction, spin on the lock if needed.
+ *
+ * Returns true if the lock is held
+ * Returns false if the lock is not held and compaction should abort
+ */
+static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+						struct compact_control *cc)
 {
-	return need_resched() || spin_is_contended(lock);
+	if (cc->mode == MIGRATE_ASYNC) {
+		if (!spin_trylock_irqsave(lock, *flags)) {
+			cc->contended = COMPACT_CONTENDED_LOCK;
+			return false;
+		}
+	} else {
+		spin_lock_irqsave(lock, *flags);
+	}
+
+	return true;
 }
 
 /*
  * Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. Check if the process needs to be scheduled or
- * if the lock is contended. For async compaction, back out in the event
- * if contention is severe. For sync compaction, schedule.
+ * very heavily contended. The lock should be periodically unlocked to avoid
+ * having disabled IRQs for a long time, even when there is nobody waiting on
+ * the lock. It might also be that allowing the IRQs will result in
+ * need_resched() becoming true. If scheduling is needed, async compaction
+ * aborts. Sync compaction schedules.
+ * Either compaction type will also abort if a fatal signal is pending.
+ * In either case if the lock was locked, it is dropped and not regained.
  *
- * Returns true if the lock is held.
- * Returns false if the lock is released and compaction should abort
+ * Returns true if compaction should abort due to fatal signal pending, or
+ * async compaction due to need_resched()
+ * Returns false when compaction can continue (sync compaction might have
+ * scheduled)
  */
-static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
-				      bool locked, struct compact_control *cc)
+static bool compact_unlock_should_abort(spinlock_t *lock,
+		unsigned long flags, bool *locked, struct compact_control *cc)
 {
-	if (should_release_lock(lock)) {
-		if (locked) {
-			spin_unlock_irqrestore(lock, *flags);
-			locked = false;
-		}
+	if (*locked) {
+		spin_unlock_irqrestore(lock, flags);
+		*locked = false;
+	}
 
-		/* async aborts if taking too long or contended */
+	if (fatal_signal_pending(current)) {
+		cc->contended = COMPACT_CONTENDED_SCHED;
+		return true;
+	}
+
+	if (need_resched()) {
 		if (cc->mode == MIGRATE_ASYNC) {
-			cc->contended = true;
-			return false;
+			cc->contended = COMPACT_CONTENDED_SCHED;
+			return true;
 		}
-
 		cond_resched();
 	}
 
-	if (!locked)
-		spin_lock_irqsave(lock, *flags);
-	return true;
+	return false;
 }
 
 /*
  * Aside from avoiding lock contention, compaction also periodically checks
  * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_checklock_irqsave() does, but
+ * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
 	/* async compaction aborts if contended */
 	if (need_resched()) {
 		if (cc->mode == MIGRATE_ASYNC) {
-			cc->contended = true;
+			cc->contended = COMPACT_CONTENDED_SCHED;
 			return true;
 		}
 
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc)
 static bool suitable_migration_target(struct page *page)
 {
 	/* If the page is a large free page, then disallow migration */
-	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return false;
+	if (PageBuddy(page)) {
+		/*
+		 * We are checking page_order without zone->lock taken. But
+		 * the only small danger is that we skip a potentially suitable
+		 * pageblock, so it's not worth to check order for valid range.
+		 */
+		if (page_order_unsafe(page) >= pageblock_order)
+			return false;
+	}
 
 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
 	if (migrate_async_suitable(get_pageblock_migratetype(page)))
@@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page)
 * (even though it may still end up isolating some pages).
 */
 static unsigned long isolate_freepages_block(struct compact_control *cc,
-				unsigned long blockpfn,
+				unsigned long *start_pfn,
 				unsigned long end_pfn,
 				struct list_head *freelist,
 				bool strict)
 {
 	int nr_scanned = 0, total_isolated = 0;
 	struct page *cursor, *valid_page = NULL;
-	unsigned long flags;
+	unsigned long flags = 0;
 	bool locked = false;
-	bool checked_pageblock = false;
+	unsigned long blockpfn = *start_pfn;
 
 	cursor = pfn_to_page(blockpfn);
 
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		int isolated, i;
 		struct page *page = cursor;
 
+		/*
+		 * Periodically drop the lock (if held) regardless of its
+		 * contention, to give chance to IRQs. Abort if fatal signal
+		 * pending or async compaction detects need_resched()
+		 */
+		if (!(blockpfn % SWAP_CLUSTER_MAX)
+		    && compact_unlock_should_abort(&cc->zone->lock, flags,
+								&locked, cc))
+			break;
+
 		nr_scanned++;
 		if (!pfn_valid_within(blockpfn))
 			goto isolate_fail;
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 			goto isolate_fail;
 
 		/*
-		 * The zone lock must be held to isolate freepages.
-		 * Unfortunately this is a very coarse lock and can be
-		 * heavily contended if there are parallel allocations
-		 * or parallel compactions. For async compaction do not
-		 * spin on the lock and we acquire the lock as late as
-		 * possible.
+		 * If we already hold the lock, we can skip some rechecking.
+		 * Note that if we hold the lock now, checked_pageblock was
+		 * already set in some previous iteration (or strict is true),
+		 * so it is correct to skip the suitable migration target
+		 * recheck as well.
 		 */
-		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
-								locked, cc);
-		if (!locked)
-			break;
-
-		/* Recheck this is a suitable migration target under lock */
-		if (!strict && !checked_pageblock) {
+		if (!locked) {
 			/*
-			 * We need to check suitability of pageblock only once
-			 * and this isolate_freepages_block() is called with
-			 * pageblock range, so just check once is sufficient.
+			 * The zone lock must be held to isolate freepages.
+			 * Unfortunately this is a very coarse lock and can be
+			 * heavily contended if there are parallel allocations
+			 * or parallel compactions. For async compaction do not
+			 * spin on the lock and we acquire the lock as late as
+			 * possible.
			 */
-			checked_pageblock = true;
-			if (!suitable_migration_target(page))
+			locked = compact_trylock_irqsave(&cc->zone->lock,
+								&flags, cc);
+			if (!locked)
 				break;
-		}
 
-		/* Recheck this is a buddy page under lock */
-		if (!PageBuddy(page))
-			goto isolate_fail;
+			/* Recheck this is a buddy page under lock */
+			if (!PageBuddy(page))
+				goto isolate_fail;
+		}
 
 		/* Found a free page, break it into order-0 pages */
 		isolated = split_free_page(page);
@@ -346,6 +423,9 @@ isolate_fail:
 			break;
 	}
 
+	/* Record how far we have got within the block */
+	*start_pfn = blockpfn;
+
 	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
 
 	/*
@@ -361,8 +441,7 @@ isolate_fail:
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (blockpfn == end_pfn)
-		update_pageblock_skip(cc, valid_page, total_isolated, true,
-				      false);
+		update_pageblock_skip(cc, valid_page, total_isolated, false);
 
 	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
 	if (total_isolated)
@@ -390,19 +469,31 @@ isolate_freepages_range(struct compact_control *cc,
 	unsigned long isolated, pfn, block_end_pfn;
 	LIST_HEAD(freelist);
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
-		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
-			break;
+	pfn = start_pfn;
+	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+
+	for (; pfn < end_pfn; pfn += isolated,
+				block_end_pfn += pageblock_nr_pages) {
+		/* Protect pfn from changing by isolate_freepages_block */
+		unsigned long isolate_start_pfn = pfn;
+
+		block_end_pfn = min(block_end_pfn, end_pfn);
 
 		/*
-		 * On subsequent iterations ALIGN() is actually not needed,
-		 * but we keep it that we not to complicate the code.
+		 * pfn could pass the block_end_pfn if isolated freepage
+		 * is more than pageblock order. In this case, we adjust
+		 * scanning range to right one.
 		 */
-		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
-		block_end_pfn = min(block_end_pfn, end_pfn);
+		if (pfn >= block_end_pfn) {
+			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+			block_end_pfn = min(block_end_pfn, end_pfn);
+		}
 
-		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
-						   &freelist, true);
+		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+			break;
+
+		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+						block_end_pfn, &freelist, true);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -433,22 +524,19 @@ isolate_freepages_range(struct compact_control *cc,
 }
 
 /* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
+static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
 	struct page *page;
 	unsigned int count[2] = { 0, };
 
+	if (list_empty(&cc->migratepages))
+		return;
+
 	list_for_each_entry(page, &cc->migratepages, lru)
 		count[!!page_is_file_cache(page)]++;
 
-	/* If locked we can use the interrupt unsafe versions */
-	if (locked) {
-		__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
-		__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
-	} else {
-		mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
-		mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
-	}
+	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -467,40 +555,34 @@ static bool too_many_isolated(struct zone *zone)
 }
 
 /**
- * isolate_migratepages_range() - isolate all migrate-able pages in range.
- * @zone:	Zone pages are in.
+ * isolate_migratepages_block() - isolate all migrate-able pages within
+ *				  a single pageblock
 * @cc:		Compaction control structure.
- * @low_pfn:	The first PFN of the range.
- * @end_pfn:	The one-past-the-last PFN of the range.
- * @unevictable: true if it allows to isolate unevictable pages
+ * @low_pfn:	The first PFN to isolate
+ * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
+ * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
- * [low_pfn, end_pfn).  Returns zero if there is a fatal signal
- * pending), otherwise PFN of the first page that was not scanned
- * (which may be both less, equal to or more then end_pfn).
+ * [low_pfn, end_pfn). The range is expected to be within same pageblock.
+ * Returns zero if there is a fatal signal pending, otherwise PFN of the
+ * first page that was not scanned (which may be both less, equal to or more
+ * than end_pfn).
 *
- * Assumes that cc->migratepages is empty and cc->nr_migratepages is
- * zero.
- *
- * Apart from cc->migratepages and cc->nr_migratetypes this function
- * does not modify any cc's fields, in particular it does not modify
- * (or read for that matter) cc->migrate_pfn.
+ * The pages are isolated on cc->migratepages list (not required to be empty),
+ * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
+ * is neither read nor updated.
 */
-unsigned long
-isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
-		unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
+static unsigned long
+isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
+			unsigned long end_pfn, isolate_mode_t isolate_mode)
 {
-	unsigned long last_pageblock_nr = 0, pageblock_nr;
+	struct zone *zone = cc->zone;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
 	struct lruvec *lruvec;
-	unsigned long flags;
+	unsigned long flags = 0;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
-	bool set_unsuitable = true;
-	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
-					ISOLATE_ASYNC_MIGRATE : 0) |
-				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -523,72 +605,43 @@
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
-		/* give a chance to irqs before checking need_resched() */
-		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
-			if (should_release_lock(&zone->lru_lock)) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				locked = false;
-			}
-		}
-
 		/*
-		 * migrate_pfn does not necessarily start aligned to a
-		 * pageblock. Ensure that pfn_valid is called when moving
-		 * into a new MAX_ORDER_NR_PAGES range in case of large
-		 * memory holes within the zone
+		 * Periodically drop the lock (if held) regardless of its
+		 * contention, to give chance to IRQs. Abort async compaction
+		 * if contended.
 		 */
-		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
-			if (!pfn_valid(low_pfn)) {
-				low_pfn += MAX_ORDER_NR_PAGES - 1;
-				continue;
-			}
-		}
+		if (!(low_pfn % SWAP_CLUSTER_MAX)
+		    && compact_unlock_should_abort(&zone->lru_lock, flags,
+								&locked, cc))
+			break;
 
 		if (!pfn_valid_within(low_pfn))
 			continue;
 		nr_scanned++;
 
-		/*
-		 * Get the page and ensure the page is within the same zone.
-		 * See the comment in isolate_freepages about overlapping
-		 * nodes. It is deliberate that the new zone lock is not taken
-		 * as memory compaction should not move pages between nodes.
-		 */
 		page = pfn_to_page(low_pfn);
-		if (page_zone(page) != zone)
-			continue;
 
 		if (!valid_page)
 			valid_page = page;
 
-		/* If isolation recently failed, do not retry */
-		pageblock_nr = low_pfn >> pageblock_order;
-		if (last_pageblock_nr != pageblock_nr) {
-			int mt;
-
-			last_pageblock_nr = pageblock_nr;
-			if (!isolation_suitable(cc, page))
-				goto next_pageblock;
+		/*
+		 * Skip if free. We read page order here without zone lock
+		 * which is generally unsafe, but the race window is small and
+		 * the worst thing that can happen is that we skip some
+		 * potential isolation targets.
+		 */
+		if (PageBuddy(page)) {
+			unsigned long freepage_order = page_order_unsafe(page);
 
 			/*
-			 * For async migration, also only scan in MOVABLE
-			 * blocks. Async migration is optimistic to see if
-			 * the minimum amount of work satisfies the allocation
+			 * Without lock, we cannot be sure that what we got is
+			 * a valid page order. Consider only values in the
+			 * valid order range to prevent low_pfn overflow.
			 */
-			mt = get_pageblock_migratetype(page);
-			if (cc->mode == MIGRATE_ASYNC &&
-			    !migrate_async_suitable(mt)) {
-				set_unsuitable = false;
-				goto next_pageblock;
-			}
-		}
-
-		/*
-		 * Skip if free. page_order cannot be used without zone->lock
-		 * as nothing prevents parallel allocations or buddy merging.
-		 */
-		if (PageBuddy(page))
+			if (freepage_order > 0 && freepage_order < MAX_ORDER)
+				low_pfn += (1UL << freepage_order) - 1;
 			continue;
+		}
 
 		/*
 		 * Check may be lockless but that's ok as we recheck later.
@@ -597,7 +650,7 @@
 		 */
 		if (!PageLRU(page)) {
 			if (unlikely(balloon_page_movable(page))) {
-				if (locked && balloon_page_isolate(page)) {
+				if (balloon_page_isolate(page)) {
 					/* Successfully isolated */
 					goto isolate_success;
 				}
@@ -617,8 +670,11 @@
 		 */
 		if (PageTransHuge(page)) {
 			if (!locked)
-				goto next_pageblock;
-			low_pfn += (1 << compound_order(page)) - 1;
+				low_pfn = ALIGN(low_pfn + 1,
+						pageblock_nr_pages) - 1;
+			else
+				low_pfn += (1 << compound_order(page)) - 1;
+
 			continue;
 		}
 
@@ -631,24 +687,26 @@
 		    page_count(page) > page_mapcount(page))
 			continue;
 
-		/* Check if it is ok to still hold the lock */
-		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-								locked, cc);
-		if (!locked || fatal_signal_pending(current))
-			break;
+		/* If we already hold the lock, we can skip some rechecking */
+		if (!locked) {
+			locked = compact_trylock_irqsave(&zone->lru_lock,
+								&flags, cc);
+			if (!locked)
+				break;
 
-		/* Recheck PageLRU and PageTransHuge under lock */
-		if (!PageLRU(page))
-			continue;
-		if (PageTransHuge(page)) {
-			low_pfn += (1 << compound_order(page)) - 1;
-			continue;
+			/* Recheck PageLRU and PageTransHuge under lock */
+			if (!PageLRU(page))
+				continue;
+			if (PageTransHuge(page)) {
+				low_pfn += (1 << compound_order(page)) - 1;
+				continue;
+			}
 		}
 
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		/* Try isolate the page */
-		if (__isolate_lru_page(page, mode) != 0)
+		if (__isolate_lru_page(page, isolate_mode) != 0)
 			continue;
 
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -667,14 +725,14 @@ isolate_success:
 			++low_pfn;
 			break;
 		}
-
-		continue;
-
-next_pageblock:
-		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
 	}
 
-	acct_isolated(zone, locked, cc);
+	/*
+	 * The PageBuddy() check could have potentially brought us outside
+	 * the range to be scanned.
+	 */
+	if (unlikely(low_pfn > end_pfn))
+		low_pfn = end_pfn;
 
 	if (locked)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -684,8 +742,7 @@ next_pageblock:
 	 * if the whole pageblock was scanned without isolating any page.
 	 */
 	if (low_pfn == end_pfn)
-		update_pageblock_skip(cc, valid_page, nr_isolated,
-				      set_unsuitable, true);
+		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -696,17 +753,68 @@ next_pageblock:
 	return low_pfn;
 }
 
+/**
+ * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
+ * @cc:        Compaction control structure.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn:   The one-past-last PFN.
+ *
+ * Returns zero if isolation fails fatally due to e.g. pending signal.
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ */
+unsigned long
+isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	unsigned long pfn, block_end_pfn;
+
+	/* Scan block by block. First and last block may be incomplete */
+	pfn = start_pfn;
+	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+
+	for (; pfn < end_pfn; pfn = block_end_pfn,
+				block_end_pfn += pageblock_nr_pages) {
+
+		block_end_pfn = min(block_end_pfn, end_pfn);
+
+		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+			continue;
+
+		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
+							ISOLATE_UNEVICTABLE);
+
+		/*
+		 * In case of fatal failure, release everything that might
+		 * have been isolated in the previous iteration, and signal
+		 * the failure back to caller.
+		 */
+		if (!pfn) {
+			putback_movable_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
+			break;
+		}
+
+		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+			break;
+	}
+	acct_isolated(cc->zone, cc);
+
+	return pfn;
+}
+
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
 /*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
-static void isolate_freepages(struct zone *zone,
-				struct compact_control *cc)
+static void isolate_freepages(struct compact_control *cc)
 {
+	struct zone *zone = cc->zone;
 	struct page *page;
 	unsigned long block_start_pfn;	/* start of current pageblock */
+	unsigned long isolate_start_pfn; /* exact pfn we start at */
 	unsigned long block_end_pfn;	/* end of current pageblock */
 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
 	int nr_freepages = cc->nr_freepages;
@@ -715,14 +823,15 @@
 	/*
 	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
-	 * zone when isolating for the first time. We need this aligned to
-	 * the pageblock boundary, because we do
+	 * zone when isolating for the first time. For looping we also need
+	 * this pfn aligned down to the pageblock boundary, because we do
 	 * block_start_pfn -= pageblock_nr_pages in the for loop.
 	 * For ending point, take care when isolating in last pageblock of a
	 * a zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
+	isolate_start_pfn = cc->free_pfn;
 	block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
 	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
 						zone_end_pfn(zone));
@@ -735,7 +844,8 @@
 	 */
 	for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
 				block_end_pfn = block_start_pfn,
-				block_start_pfn -= pageblock_nr_pages) {
+				block_start_pfn -= pageblock_nr_pages,
+				isolate_start_pfn = block_start_pfn) {
 		unsigned long isolated;
 
 		/*
@@ -747,18 +857,9 @@
 		    && compact_should_abort(cc))
 			break;
 
-		if (!pfn_valid(block_start_pfn))
-			continue;
-
-		/*
-		 * Check for overlapping nodes/zones. It's possible on some
-		 * configurations to have a setup like
-		 * node0 node1 node0
-		 * i.e. it's possible that all pages within a zones range of
-		 * pages do not belong to a single zone.
-		 */
-		page = pfn_to_page(block_start_pfn);
-		if (page_zone(page) != zone)
+		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+									zone);
+		if (!page)
 			continue;
 
 		/* Check the block is suitable for migration */
@@ -769,13 +870,25 @@
 		if (!isolation_suitable(cc, page))
 			continue;
 
-		/* Found a block suitable for isolating free pages from */
-		cc->free_pfn = block_start_pfn;
-		isolated = isolate_freepages_block(cc, block_start_pfn,
+		/* Found a block suitable for isolating free pages from. */
+		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
 
 		/*
+		 * Remember where the free scanner should restart next time,
+		 * which is where isolate_freepages_block() left off.
+		 * But if it scanned the whole pageblock, isolate_start_pfn
+		 * now points at block_end_pfn, which is the start of the next
+		 * pageblock.
+		 * In that case we will however want to restart at the start
+		 * of the previous pageblock.
+		 */
+		cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
+				isolate_start_pfn :
+				block_start_pfn - pageblock_nr_pages;
+
+		/*
 		 * Set a flag that we successfully isolated in this pageblock.
 		 * In the next loop iteration, zone->compact_cached_free_pfn
 		 * will not be updated and thus it will effectively contain the
@@ -822,7 +935,7 @@ static struct page *compaction_alloc(struct page *migratepage,
 	 */
 	if (list_empty(&cc->freepages)) {
 		if (!cc->contended)
-			isolate_freepages(cc->zone, cc);
+			isolate_freepages(cc);
 
 		if (list_empty(&cc->freepages))
 			return NULL;
@@ -856,38 +969,88 @@ typedef enum {
 } isolate_migrate_t;
 
 /*
- * Isolate all pages that can be migrated from the block pointed to by
- * the migrate scanner within compact_control.
+ * Isolate all pages that can be migrated from the first suitable block,
+ * starting at the block pointed to by the migrate scanner pfn within
+ * compact_control.
 */
 static isolate_migrate_t isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
 	unsigned long low_pfn, end_pfn;
+	struct page *page;
+	const isolate_mode_t isolate_mode =
+		(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
 
-	/* Do not scan outside zone boundaries */
-	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+	/*
+	 * Start at where we last stopped, or beginning of the zone as
+	 * initialized by compact_zone()
+	 */
+	low_pfn = cc->migrate_pfn;
 
 	/* Only scan within a pageblock boundary */
 	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
 
-	/* Do not cross the free scanner or scan within a memory hole */
-	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
-		cc->migrate_pfn = end_pfn;
-		return ISOLATE_NONE;
-	}
+	/*
	 * Iterate over whole pageblocks until we find the first suitable.
+	 * Do not cross the free scanner.
+	 */
+	for (; end_pfn <= cc->free_pfn;
+			low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+
+		/*
+		 * This can potentially iterate a massively long zone with
+		 * many pageblocks unsuitable, so periodically check if we
+		 * need to schedule, or even abort async compaction.
+		 */
+		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+						&& compact_should_abort(cc))
+			break;
 
-	/* Perform the isolation */
-	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
-	if (!low_pfn || cc->contended)
-		return ISOLATE_ABORT;
+		page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+		if (!page)
+			continue;
 
-	cc->migrate_pfn = low_pfn;
+		/* If isolation recently failed, do not retry */
+		if (!isolation_suitable(cc, page))
+			continue;
 
-	return ISOLATE_SUCCESS;
+		/*
+		 * For async compaction, also only scan in MOVABLE blocks.
+		 * Async compaction is optimistic to see if the minimum amount
+		 * of work satisfies the allocation.
+		 */
+		if (cc->mode == MIGRATE_ASYNC &&
+		    !migrate_async_suitable(get_pageblock_migratetype(page)))
+			continue;
+
+		/* Perform the isolation */
+		low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
+								isolate_mode);
+
+		if (!low_pfn || cc->contended)
+			return ISOLATE_ABORT;
+
+		/*
+		 * Either we isolated something and proceed with migration. Or
+		 * we failed and compact_zone should decide if we should
+		 * continue or not.
+		 */
+		break;
+	}
+
+	acct_isolated(zone, cc);
+	/*
	 * Record where migration scanner will be restarted. If we end up in
+	 * the same pageblock as the free scanner, make the scanners fully
+	 * meet so that compact_finished() terminates compaction.
+	 */
+	cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
+
+	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
 
-static int compact_finished(struct zone *zone,
-			    struct compact_control *cc)
+static int compact_finished(struct zone *zone, struct compact_control *cc,
+			    const int migratetype)
 {
 	unsigned int order;
 	unsigned long watermark;
@@ -933,7 +1096,7 @@ static int compact_finished(struct zone *zone,
 		struct free_area *area = &zone->free_area[order];
 
 		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&area->free_list[cc->migratetype]))
+		if (!list_empty(&area->free_list[migratetype]))
 			return COMPACT_PARTIAL;
 
 		/* Job done if allocation would set block type */
@@ -999,6 +1162,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
@@ -1041,7 +1205,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	migrate_prep_local();
 
-	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+	while ((ret = compact_finished(zone, cc, migratetype)) ==
+						COMPACT_CONTINUE) {
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1056,9 +1221,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		if (!cc->nr_migratepages)
-			continue;
-
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
@@ -1092,14 +1254,14 @@ out:
 }
 
 static unsigned long compact_zone_order(struct zone *zone, int order,
-		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+		gfp_t gfp_mask, enum migrate_mode mode, int *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
 		.order = order,
-		.migratetype = allocflags_to_migratetype(gfp_mask),
+		.gfp_mask = gfp_mask,
 		.zone = zone,
 		.mode = mode,
 	};
@@ -1124,48 +1286,117 @@ int sysctl_extfrag_threshold = 500;
 * @gfp_mask: The GFP mask of the current allocation
 * @nodemask: The allowed nodes to allocate from
 * @mode: The migration mode for async, sync light, or sync migration
- * @contended: Return value that is true if compaction was aborted due to lock contention
- * @page: Optionally capture a free page of the requested order during compaction
+ * @contended: Return value that determines if compaction was aborted due to
+ *	       need_resched() or lock contention
+ * @candidate_zone: Return the zone where we think allocation should succeed
 *
 * This is the main entry point for direct page compaction.
*/ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended) + enum migrate_mode mode, int *contended, + struct zone **candidate_zone) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_SKIPPED; + int rc = COMPACT_DEFERRED; int alloc_flags = 0; + int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ + + *contended = COMPACT_CONTENDED_NONE; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) - return rc; - - count_compact_event(COMPACTSTALL); + return COMPACT_SKIPPED; #ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { int status; + int zone_contended; + + if (compaction_deferred(zone, order)) + continue; status = compact_zone_order(zone, order, gfp_mask, mode, - contended); + &zone_contended); rc = max(status, rc); + /* + * It takes at least one zone that wasn't lock contended + * to clear all_zones_contended. + */ + all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) - break; + alloc_flags)) { + *candidate_zone = zone; + /* + * We think the allocation will succeed in this zone, + * but it is not certain, hence the false. The caller + * will repeat this with true if allocation indeed + * succeeds in this zone. + */ + compaction_defer_reset(zone, order, false); + /* + * It is possible that async compaction aborted due to + * need_resched() and the watermarks were ok thanks to + * somebody else freeing memory. The allocation can + * however still fail so we better signal the + * need_resched() contention anyway (this will not + * prevent the allocation attempt). + */ + if (zone_contended == COMPACT_CONTENDED_SCHED) + *contended = COMPACT_CONTENDED_SCHED; + + goto break_loop; + } + + if (mode != MIGRATE_ASYNC) { + /* + * We think that allocation won't succeed in this zone + * so we defer compaction there. If it ends up + * succeeding after all, it will be reset. + */ + defer_compaction(zone, order); + } + + /* + * We might have stopped compacting due to need_resched() in + * async compaction, or due to a fatal signal detected. In that + * case do not try further zones and signal need_resched() + * contention. + */ + if ((zone_contended == COMPACT_CONTENDED_SCHED) + || fatal_signal_pending(current)) { + *contended = COMPACT_CONTENDED_SCHED; + goto break_loop; + } + + continue; +break_loop: + /* + * We might not have tried all the zones, so be conservative + * and assume they are not all lock contended. + */ + all_zones_contended = 0; + break; } + /* + * If at least one zone wasn't deferred or skipped, we report if all + * zones that were tried were lock contended. + */ + if (rc > COMPACT_SKIPPED && all_zones_contended) + *contended = COMPACT_CONTENDED_LOCK; + return rc; } diff --git a/mm/debug.c b/mm/debug.c new file mode 100644 index 0000000..5ce45c9 --- /dev/null +++ b/mm/debug.c @@ -0,0 +1,237 @@ +/* + * mm/debug.c + * + * mm/ specific debug routines. 
+ * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/ftrace_event.h> +#include <linux/memcontrol.h> + +static const struct trace_print_flags pageflag_names[] = { + {1UL << PG_locked, "locked" }, + {1UL << PG_error, "error" }, + {1UL << PG_referenced, "referenced" }, + {1UL << PG_uptodate, "uptodate" }, + {1UL << PG_dirty, "dirty" }, + {1UL << PG_lru, "lru" }, + {1UL << PG_active, "active" }, + {1UL << PG_slab, "slab" }, + {1UL << PG_owner_priv_1, "owner_priv_1" }, + {1UL << PG_arch_1, "arch_1" }, + {1UL << PG_reserved, "reserved" }, + {1UL << PG_private, "private" }, + {1UL << PG_private_2, "private_2" }, + {1UL << PG_writeback, "writeback" }, +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_head, "head" }, + {1UL << PG_tail, "tail" }, +#else + {1UL << PG_compound, "compound" }, +#endif + {1UL << PG_swapcache, "swapcache" }, + {1UL << PG_mappedtodisk, "mappedtodisk" }, + {1UL << PG_reclaim, "reclaim" }, + {1UL << PG_swapbacked, "swapbacked" }, + {1UL << PG_unevictable, "unevictable" }, +#ifdef CONFIG_MMU + {1UL << PG_mlocked, "mlocked" }, +#endif +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + {1UL << PG_uncached, "uncached" }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + {1UL << PG_hwpoison, "hwpoison" }, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, +#endif +}; + +static void dump_flags(unsigned long flags, + const struct trace_print_flags *names, int count) +{ + const char *delim = ""; + unsigned long mask; + int i; + + pr_emerg("flags: %#lx(", flags); + + /* remove zone id */ + flags &= (1UL << NR_PAGEFLAGS) - 1; + + for (i = 0; i < count && flags; i++) { + + mask = names[i].mask; + if ((flags & mask) != mask) + continue; + + flags &= ~mask; + pr_cont("%s%s", delim, names[i].name); + delim = "|"; + } + + /* check for left over flags */ + if (flags) + pr_cont("%s%#lx", delim, flags); + + pr_cont(")\n"); +} + +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) +{ + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + page, atomic_read(&page->_count), page_mapcount(page), + page->mapping, page->index); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); + dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + if (reason) + pr_alert("page dumped because: %s\n", reason); + if (page->flags & badflags) { + pr_alert("bad because of flags:\n"); + dump_flags(page->flags & badflags, + pageflag_names, ARRAY_SIZE(pageflag_names)); + } + mem_cgroup_print_bad_page(page); +} + +void dump_page(struct page *page, const char *reason) +{ + dump_page_badflags(page, reason, 0); +} +EXPORT_SYMBOL(dump_page); + +#ifdef CONFIG_DEBUG_VM + +static const struct trace_print_flags vmaflags_names[] = { + {VM_READ, "read" }, + {VM_WRITE, "write" }, + {VM_EXEC, "exec" }, + {VM_SHARED, "shared" }, + {VM_MAYREAD, "mayread" }, + {VM_MAYWRITE, "maywrite" }, + {VM_MAYEXEC, "mayexec" }, + {VM_MAYSHARE, "mayshare" }, + {VM_GROWSDOWN, "growsdown" }, + {VM_PFNMAP, "pfnmap" }, + {VM_DENYWRITE, "denywrite" }, + {VM_LOCKED, "locked" }, + {VM_IO, "io" }, + {VM_SEQ_READ, "seqread" }, + {VM_RAND_READ, "randread" }, + {VM_DONTCOPY, "dontcopy" }, + {VM_DONTEXPAND, "dontexpand" }, + {VM_ACCOUNT, "account" }, + {VM_NORESERVE, "noreserve" }, + {VM_HUGETLB, "hugetlb" }, + {VM_NONLINEAR, "nonlinear" }, +#if defined(CONFIG_X86) + {VM_PAT, "pat" }, +#elif defined(CONFIG_PPC) + {VM_SAO, "sao" }, +#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) + {VM_GROWSUP, "growsup" }, +#elif 
!defined(CONFIG_MMU) + {VM_MAPPED_COPY, "mappedcopy" }, +#else + {VM_ARCH_1, "arch_1" }, +#endif + {VM_DONTDUMP, "dontdump" }, +#ifdef CONFIG_MEM_SOFT_DIRTY + {VM_SOFTDIRTY, "softdirty" }, +#endif + {VM_MIXEDMAP, "mixedmap" }, + {VM_HUGEPAGE, "hugepage" }, + {VM_NOHUGEPAGE, "nohugepage" }, + {VM_MERGEABLE, "mergeable" }, +}; + +void dump_vma(const struct vm_area_struct *vma) +{ + pr_emerg("vma %p start %p end %p\n" + "next %p prev %p mm %p\n" + "prot %lx anon_vma %p vm_ops %p\n" + "pgoff %lx file %p private_data %p\n", + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, + vma->vm_prev, vma->vm_mm, + (unsigned long)pgprot_val(vma->vm_page_prot), + vma->anon_vma, vma->vm_ops, vma->vm_pgoff, + vma->vm_file, vma->vm_private_data); + dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); +} +EXPORT_SYMBOL(dump_vma); + +void dump_mm(const struct mm_struct *mm) +{ + pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n" +#ifdef CONFIG_MMU + "get_unmapped_area %p\n" +#endif + "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" + "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" + "start_code %lx end_code %lx start_data %lx end_data %lx\n" + "start_brk %lx brk %lx start_stack %lx\n" + "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" + "binfmt %p flags %lx core_state %p\n" +#ifdef CONFIG_AIO + "ioctx_table %p\n" +#endif +#ifdef CONFIG_MEMCG + "owner %p " +#endif + "exe_file %p\n" +#ifdef CONFIG_MMU_NOTIFIER + "mmu_notifier_mm %p\n" +#endif +#ifdef CONFIG_NUMA_BALANCING + "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" +#endif +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) + "tlb_flush_pending %d\n" +#endif + "%s", /* This is here to hold the comma */ + + mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, +#ifdef CONFIG_MMU + mm->get_unmapped_area, +#endif + mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, + mm->pgd, atomic_read(&mm->mm_users), + atomic_read(&mm->mm_count), + atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm->map_count, + mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, + mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, + mm->start_code, mm->end_code, mm->start_data, mm->end_data, + mm->start_brk, mm->brk, mm->start_stack, + mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, + mm->binfmt, mm->flags, mm->core_state, +#ifdef CONFIG_AIO + mm->ioctx_table, +#endif +#ifdef CONFIG_MEMCG + mm->owner, +#endif + mm->exe_file, +#ifdef CONFIG_MMU_NOTIFIER + mm->mmu_notifier_mm, +#endif +#ifdef CONFIG_NUMA_BALANCING + mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, +#endif +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) + mm->tlb_flush_pending, +#endif + "" /* This is here to not have a comma! 
*/ + ); + + dump_flags(mm->def_flags, vmaflags_names, + ARRAY_SIZE(vmaflags_names)); +} + +#endif /* CONFIG_DEBUG_VM */ diff --git a/mm/dmapool.c b/mm/dmapool.c index 306baa5..fd5fe43 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ }; static DEFINE_MUTEX(pools_lock); +static DEFINE_MUTEX(pools_reg_lock); static ssize_t show_pools(struct device *dev, struct device_attribute *attr, char *buf) @@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; + bool empty = false; - if (align == 0) { + if (align == 0) align = 1; - } else if (align & (align - 1)) { + else if (align & (align - 1)) return NULL; - } - if (size == 0) { + if (size == 0) return NULL; - } else if (size < 4) { + else if (size < 4) size = 4; - } if ((size % align) != 0) size = ALIGN(size, align); allocation = max_t(size_t, size, PAGE_SIZE); - if (!boundary) { + if (!boundary) boundary = allocation; - } else if ((boundary < size) || (boundary & (boundary - 1))) { + else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - } retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); if (!retval) @@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, INIT_LIST_HEAD(&retval->pools); + /* + * pools_lock ensures that the ->dma_pools list does not get corrupted. + * pools_reg_lock ensures that there is not a race between + * dma_pool_create() and dma_pool_destroy() or within dma_pool_create() + * when the first invocation of dma_pool_create() failed on + * device_create_file() and the second assumes that it has been done (I + * know it is a short window). + */ + mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools) && - device_create_file(dev, &dev_attr_pools)) { - kfree(retval); - return NULL; - } else - list_add(&retval->pools, &dev->dma_pools); + if (list_empty(&dev->dma_pools)) + empty = true; + list_add(&retval->pools, &dev->dma_pools); mutex_unlock(&pools_lock); - + if (empty) { + int err; + + err = device_create_file(dev, &dev_attr_pools); + if (err) { + mutex_lock(&pools_lock); + list_del(&retval->pools); + mutex_unlock(&pools_lock); + mutex_unlock(&pools_reg_lock); + kfree(retval); + return NULL; + } + } + mutex_unlock(&pools_reg_lock); return retval; } EXPORT_SYMBOL(dma_pool_create); @@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) */ void dma_pool_destroy(struct dma_pool *pool) { + bool empty = false; + + mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); if (pool->dev && list_empty(&pool->dev->dma_pools)) - device_remove_file(pool->dev, &dev_attr_pools); + empty = true; mutex_unlock(&pools_lock); + if (empty) + device_remove_file(pool->dev, &dev_attr_pools); + mutex_unlock(&pools_reg_lock); while (!list_empty(&pool->page_list)) { struct dma_page *page; diff --git a/mm/filemap.c b/mm/filemap.c index f092654..14b4642 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ +#include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> #include <linux/rmap.h> @@ -233,7 +234,6 @@ void delete_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - 
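The pools_reg_lock/pools_lock pairing introduced in dma_pool_create() and dma_pool_destroy() above is a general two-lock pattern: the inner lock only keeps the list consistent, while the outer lock serializes the create-sysfs-file-on-first-pool and remove-on-last-pool side effect so two racing creators cannot both observe an empty list. A userspace sketch of the same shape, with register_attr()/unregister_attr() as stand-ins for device_create_file()/device_remove_file():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t reg_lock = PTHREAD_MUTEX_INITIALIZER;	/* outer: pools_reg_lock */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;	/* inner: pools_lock */
static int nr_pools;

static int register_attr(void)		/* stand-in for device_create_file() */
{
	puts("attribute registered");
	return 0;
}

static void unregister_attr(void)	/* stand-in for device_remove_file() */
{
	puts("attribute removed");
}

int pool_create(void)
{
	bool first;
	int err = 0;

	pthread_mutex_lock(&reg_lock);
	pthread_mutex_lock(&list_lock);
	first = (nr_pools++ == 0);	/* "add to the list" under the inner lock */
	pthread_mutex_unlock(&list_lock);

	/* Side effect runs outside the inner lock, serialized by the outer one. */
	if (first && (err = register_attr()) != 0) {
		pthread_mutex_lock(&list_lock);
		nr_pools--;		/* roll back, as the patch does on failure */
		pthread_mutex_unlock(&list_lock);
	}
	pthread_mutex_unlock(&reg_lock);
	return err;
}

void pool_destroy(void)
{
	bool last;

	pthread_mutex_lock(&reg_lock);
	pthread_mutex_lock(&list_lock);
	last = (--nr_pools == 0);
	pthread_mutex_unlock(&list_lock);
	if (last)
		unregister_attr();
	pthread_mutex_unlock(&reg_lock);
}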
mem_cgroup_uncharge_cache_page(page); if (freepage) freepage(page); @@ -489,8 +489,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); - /* mem_cgroup codes must not be called under tree_lock */ - mem_cgroup_replace_page_cache(old, new); + mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) freepage(old); @@ -548,19 +547,24 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + int huge = PageHuge(page); + struct mem_cgroup *memcg; int error; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_charge_file(page, current->mm, - gfp_mask & GFP_RECLAIM_MASK); - if (error) - return error; + if (!huge) { + error = mem_cgroup_try_charge(page, current->mm, + gfp_mask, &memcg); + if (error) + return error; + } error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); return error; } @@ -575,13 +579,16 @@ static int __add_to_page_cache_locked(struct page *page, goto err_insert; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + if (!huge) + mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); return error; } @@ -663,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc); * at a cost of "thundering herd" phenomena during rare hash * collisions. */ -static wait_queue_head_t *page_waitqueue(struct page *page) +wait_queue_head_t *page_waitqueue(struct page *page) { const struct zone *zone = page_zone(page); return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; } - -static inline void wake_up_page(struct page *page, int bit) -{ - __wake_up_bit(page_waitqueue(page), &page->flags, bit); -} +EXPORT_SYMBOL(page_waitqueue); void wait_on_page_bit(struct page *page, int bit_nr) { @@ -696,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) bit_wait_io, TASK_KILLABLE); } +int wait_on_page_bit_killable_timeout(struct page *page, + int bit_nr, unsigned long timeout) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + wait.key.timeout = jiffies + timeout; + if (!test_bit(bit_nr, &page->flags)) + return 0; + return __wait_on_bit(page_waitqueue(page), &wait, + bit_wait_io_timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); + /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue * @page: Page defining the wait queue of interest @@ -808,6 +824,17 @@ int __lock_page_killable(struct page *page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +/* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. 
+ */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -1091,9 +1118,9 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK; - /* Init accessed so avoit atomic mark_page_accessed later */ + /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - init_page_accessed(page); + __SetPageReferenced(page); err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { @@ -1726,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter); static int page_cache_read(struct file *file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; - struct page *page; + struct page *page; int ret; do { @@ -1743,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset) page_cache_release(page); } while (ret == AOP_TRUNCATED_PAGE); - + return ret; } @@ -1827,6 +1854,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -2572,7 +2611,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (unlikely(status < 0) && !written) { + if (unlikely(status < 0)) { err = status; goto out; } @@ -10,6 +10,10 @@ #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <asm/pgtable.h> + #include "internal.h" static struct page *no_page_table(struct vm_area_struct *vma, @@ -258,6 +262,11 @@ unmap: return ret; } +/* + * mmap_sem must be held on entry. If @nonblocking != NULL and + * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. + * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { @@ -276,6 +285,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + if (*flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } ret = handle_mm_fault(mm, vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { @@ -373,7 +386,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. * - * Must be called with mmap_sem held for read or write. + * Must be called with mmap_sem held. It may be released. See below. 
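The @nonblocking contract spelled out above puts the burden on the caller: hold mmap_sem for read, and if *@nonblocking comes back 0 the semaphore has already been dropped and must not be released again. A hedged, kernel-style sketch of a caller honouring that contract; pin_range() is hypothetical, while the __get_user_pages() call follows the signature used in this file:

/*
 * Hypothetical caller honouring the @nonblocking contract described
 * above: take mmap_sem for read, and only drop it ourselves if
 * __get_user_pages() did not already do so on our behalf.
 */
static long pin_range(struct task_struct *tsk, struct mm_struct *mm,
		      unsigned long start, unsigned long nr_pages,
		      struct page **pages)
{
	long ret;
	int locked = 1;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages(tsk, mm, start, nr_pages, FOLL_TOUCH,
			       pages, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);
	/* if !locked, __get_user_pages() released mmap_sem for us */
	return ret;
}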
* * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given @@ -396,7 +409,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * If @nonblocking != NULL, __get_user_pages will not wait for disk IO * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. + * *@nonblocking will be set to 0. Further, if @gup_flags does not + * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in + * this case. + * + * A caller using such a combination of @nonblocking and @gup_flags + * must therefore hold the mmap_sem for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if @@ -528,7 +548,7 @@ EXPORT_SYMBOL(__get_user_pages); * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This should be called with the mm_sem held for read. + * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) @@ -660,3 +680,353 @@ struct page *get_dump_page(unsigned long addr) return page; } #endif /* CONFIG_ELF_CORE */ + +/* + * Generic RCU Fast GUP + * + * get_user_pages_fast attempts to pin user pages by walking the page + * tables directly and avoids taking locks. Thus the walker needs to be + * protected from page table pages being freed from under it, and should + * block any THP splits. + * + * One way to achieve this is to have the walker disable interrupts, and + * rely on IPIs from the TLB flushing code blocking before the page table + * pages are freed. This is unsuitable for architectures that do not need + * to broadcast an IPI when invalidating TLBs. + * + * Another way to achieve this is to batch up page table containing pages + * belonging to more than one mm_user, then rcu_sched a callback to free those + * pages. Disabling interrupts will allow the fast_gup walker to both block + * the rcu_sched callback, and an IPI that we broadcast for splitting THPs + * (which is a relatively rare event). The code below adopts this strategy. + * + * Before activating this code, please be aware that the following assumptions + * are currently made: + * + * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free + * pages containing page tables. + * + * *) THP splits will broadcast an IPI, this can be achieved by overriding + * pmdp_splitting_flush. + * + * *) ptes can be read atomically by the architecture. + * + * *) access_ok is sufficient to validate userspace address ranges. + * + * The last two assumptions can be relaxed by the addition of helper functions. + * + * This code is based heavily on the PowerPC implementation by Nick Piggin. + */ +#ifdef CONFIG_HAVE_GENERIC_RCU_GUP + +#ifdef __HAVE_ARCH_PTE_SPECIAL +static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + pte_t *ptep, *ptem; + int ret = 0; + + ptem = ptep = pte_offset_map(&pmd, addr); + do { + /* + * In the line below we are assuming that the pte can be read + * atomically. If this is not the case for your architecture, + * please wrap this in a helper function! 
+ * + * for an example see gup_get_pte in arch/x86/mm/gup.c + */ + pte_t pte = ACCESS_ONCE(*ptep); + struct page *page; + + /* + * Similar to the PMD case below, NUMA hinting must take slow + * path + */ + if (!pte_present(pte) || pte_special(pte) || + pte_numa(pte) || (write && !pte_write(pte))) + goto pte_unmap; + + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + + if (!page_cache_get_speculative(page)) + goto pte_unmap; + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + put_page(page); + goto pte_unmap; + } + + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + + ret = 1; + +pte_unmap: + pte_unmap(ptem); + return ret; +} +#else + +/* + * If we can't determine whether or not a pte is special, then fail immediately + * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not + * to be special. + * + * For a futex to be placed on a THP tail page, get_futex_key requires a + * __get_user_pages_fast implementation that can pin pages. Thus it's still + * useful to have gup_huge_pmd even if we can't operate on ptes. + */ +static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + return 0; +} +#endif /* __HAVE_ARCH_PTE_SPECIAL */ + +static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page, *tail; + int refs; + + if (write && !pmd_write(orig)) + return 0; + + refs = 0; + head = pmd_page(orig); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail pages need their mapcount reference taken before we + * return. (This allows the THP code to bump their ref count when + * they are split into base pages). 
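The heart of gup_pte_range() above is a lock-free "speculative reference, then revalidate" sequence: take a refcount that is allowed to fail (page_cache_get_speculative()), re-read the pte, and back out with put_page() if it changed underneath us. The same shape reduced to C11 atomics on a toy record; struct record, get_speculative() and pin_if_unchanged() are illustrative stand-ins, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>

struct record {
	atomic_ulong value;	/* stands in for the pte */
	atomic_int refcount;	/* stands in for page->_count */
};

/* Fails instead of resurrecting a record whose count dropped to zero,
 * like page_cache_get_speculative(). */
static bool get_speculative(struct record *r)
{
	int c = atomic_load(&r->refcount);

	do {
		if (c == 0)
			return false;
	} while (!atomic_compare_exchange_weak(&r->refcount, &c, c + 1));
	return true;
}

static void put_record(struct record *r)
{
	atomic_fetch_sub(&r->refcount, 1);
}

/* Returns true only if the value was stable across taking the reference. */
static bool pin_if_unchanged(struct record *r, unsigned long seen)
{
	if (!get_speculative(r))
		return false;
	if (atomic_load(&r->value) != seen) {	/* the pte_val(*ptep) recheck */
		put_record(r);
		return false;
	}
	return true;
}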
+ */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + +static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page, *tail; + int refs; + + if (write && !pud_write(orig)) + return 0; + + refs = 0; + head = pud_page(orig); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pud_val(orig) != pud_val(*pudp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = ACCESS_ONCE(*pmdp); + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + return 0; + + if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_numa(pmd)) + return 0; + + if (!gup_huge_pmd(pmd, pmdp, addr, next, write, + pages, nr)) + return 0; + + } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(pgdp, addr); + do { + pud_t pud = ACCESS_ONCE(*pudp); + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (pud_huge(pud)) { + if (!gup_huge_pud(pud, pudp, addr, next, write, + pages, nr)) + return 0; + } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. It will only return non-negative values. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next, flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + return 0; + + /* + * Disable interrupts. We use the nested form as we can already have + * interrupts disabled by get_futex_key. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h + * for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting. 
+ */ + + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgdp)) + break; + else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + int nr, ret; + + start &= PAGE_MASK; + nr = __get_user_pages_fast(start, nr_pages, write, pages); + ret = nr; + + if (nr < nr_pages) { + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + nr_pages - nr, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + } + + return ret; +} + +#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ diff --git a/mm/highmem.c b/mm/highmem.c index b32b70c..123bcd3 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); */ #ifdef CONFIG_HIGHMEM +/* + * Architecture with aliasing data cache may define the following family of + * helper functions in its asm/highmem.h to control cache color of virtual + * addresses where physical memory pages are mapped by kmap. + */ +#ifndef get_pkmap_color + +/* + * Determine color of virtual address where the page should be mapped. + */ +static inline unsigned int get_pkmap_color(struct page *page) +{ + return 0; +} +#define get_pkmap_color get_pkmap_color + +/* + * Get next index for mapping inside PKMAP region for page with given color. + */ +static inline unsigned int get_next_pkmap_nr(unsigned int color) +{ + static unsigned int last_pkmap_nr; + + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + return last_pkmap_nr; +} + +/* + * Determine if page index inside PKMAP region (pkmap_nr) of given color + * has wrapped around PKMAP region end. When this happens an attempt to + * flush all unused PKMAP slots is made. + */ +static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) +{ + return pkmap_nr == 0; +} + +/* + * Get the number of PKMAP entries of the given color. If no free slot is + * found after checking that many entries, kmap will sleep waiting for + * someone to call kunmap and free PKMAP slot. + */ +static inline int get_pkmap_entries_count(unsigned int color) +{ + return LAST_PKMAP; +} + +/* + * Get head of a wait queue for PKMAP entries of the given color. + * Wait queues for different mapping colors should be independent to avoid + * unnecessary wakeups caused by freeing of slots of other colors. 
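Seen from the consumer side, the kerneldoc contract of get_user_pages_fast() above works out as follows; a hedged sketch of a hypothetical driver-style helper, not part of this patch. It pins the pages backing a user buffer, uses them, and drops every reference with put_page(), handling a short positive return (a partial pin) explicitly:

/* Hypothetical helper illustrating the documented return semantics. */
static int process_user_buffer(unsigned long uaddr, int nr_pages,
			       struct page **pages)
{
	int i, pinned;

	pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
	if (pinned < 0)
		return pinned;		/* no pages were pinned */

	/* ... kmap()/DMA-map the pinned pages and do the real work ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);

	return pinned == nr_pages ? 0 : -EFAULT;	/* partial pin */
}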
+ */ +static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) +{ + static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + + return &pkmap_map_wait; +} +#endif + unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); @@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void) } static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); - /* * Most architectures have no use for kmap_high_get(), so let's abstract * the disabling of IRQ out of the locking in that case to save on a @@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; int count; + unsigned int last_pkmap_nr; + unsigned int color = get_pkmap_color(page); start: - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); /* Find an empty entry */ for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { + last_pkmap_nr = get_next_pkmap_nr(color); + if (no_more_pkmaps(last_pkmap_nr, color)) { flush_all_zero_pkmaps(); - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); } if (!pkmap_count[last_pkmap_nr]) break; /* Found a usable entry */ @@ -181,12 +240,14 @@ start: */ { DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *pkmap_map_wait = + get_pkmap_wait_queue_head(color); __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); + add_wait_queue(pkmap_map_wait, &wait); unlock_kmap(); schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); + remove_wait_queue(pkmap_map_wait, &wait); lock_kmap(); /* Somebody else might have mapped it while we slept */ @@ -274,6 +335,8 @@ void kunmap_high(struct page *page) unsigned long nr; unsigned long flags; int need_wakeup; + unsigned int color = get_pkmap_color(page); + wait_queue_head_t *pkmap_map_wait; lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); @@ -299,13 +362,14 @@ void kunmap_high(struct page *page) * no need for the wait-queue-head's lock. Simply * test if the queue is empty. 
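As a concrete but speculative illustration of the override mechanism above, an architecture with two kmap cache colors might provide hooks along these lines in its asm/highmem.h, keeping one rotor per color so a page is only ever mapped at a virtual slot of matching color. The remaining hooks (no_more_pkmaps(), get_pkmap_entries_count(), the per-color wait queues) would be overridden in the same style:

/* Sketch only: asm/highmem.h for a hypothetical two-color data cache. */
#define PKMAP_COLORS	2

static inline unsigned int get_pkmap_color(struct page *page)
{
	/* the physical address decides the cache color */
	return page_to_pfn(page) & (PKMAP_COLORS - 1);
}
#define get_pkmap_color get_pkmap_color

static inline unsigned int get_next_pkmap_nr(unsigned int color)
{
	static unsigned int last_pkmap_nr[PKMAP_COLORS];

	/* stride by PKMAP_COLORS so the slot color never changes */
	last_pkmap_nr[color] = (last_pkmap_nr[color] + PKMAP_COLORS) &
				LAST_PKMAP_MASK;
	return last_pkmap_nr[color] | color;
}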
*/ - need_wakeup = waitqueue_active(&pkmap_map_wait); + pkmap_map_wait = get_pkmap_wait_queue_head(color); + need_wakeup = waitqueue_active(pkmap_map_wait); } unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) - wake_up(&pkmap_map_wait); + wake_up(pkmap_map_wait); } EXPORT_SYMBOL(kunmap_high); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33514d8..de98415 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -200,7 +200,7 @@ retry: preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); - __free_page(zero_page); + __free_pages(zero_page, compound_order(zero_page)); goto retry; } @@ -232,7 +232,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); - __free_page(zero_page); + __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } @@ -715,13 +715,20 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, unsigned long haddr, pmd_t *pmd, struct page *page) { + struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + return VM_FAULT_OOM; + pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) + if (unlikely(!pgtable)) { + mem_cgroup_cancel_charge(page, memcg); return VM_FAULT_OOM; + } clear_huge_page(page, haddr, HPAGE_PMD_NR); /* @@ -734,7 +741,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(mm, pgtable); } else { @@ -742,6 +749,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -794,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { @@ -827,13 +836,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { - mem_cgroup_uncharge_page(page); put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -979,6 +982,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + struct mem_cgroup *memcg; spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; @@ -999,20 +1003,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_charge_anon(pages[i], 
mm, - GFP_KERNEL))) { + mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, + &memcg))) { if (pages[i]) put_page(pages[i]); - mem_cgroup_uncharge_start(); while (--i >= 0) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); ret |= VM_FAULT_OOM; goto out; } + set_page_private(pages[i], (unsigned long)memcg); } for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1041,7 +1046,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pte_t *pte, entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); page_add_new_anon_rmap(pages[i], vma, haddr); + mem_cgroup_commit_charge(pages[i], memcg, false); + lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1065,12 +1074,12 @@ out: out_free_pages: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); goto out; } @@ -1081,12 +1090,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; + struct mem_cgroup *memcg; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ ptl = pmd_lockptr(mm, pmd); - VM_BUG_ON(!vma->anon_vma); + VM_BUG_ON_VMA(!vma->anon_vma, vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; @@ -1132,7 +1142,8 @@ alloc: goto out; } - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); @@ -1161,7 +1172,7 @@ alloc: put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); put_page(new_page); goto out_mn; } else { @@ -1170,6 +1181,8 @@ alloc: entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); if (!page) { @@ -1681,7 +1694,7 @@ static void __split_huge_page_refcount(struct page *page, &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ - smp_mb(); + smp_mb__after_atomic(); /* * retain hwpoison flag of the poisoned tail page: @@ -1775,21 +1788,24 @@ static int __split_huge_page_map(struct page *page, if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); haddr = address; for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); + /* + * Note that pmd_numa is not transferred deliberately + * to avoid any possibility that pte_numa leaks to + * a PROT_NONE VMA by accident. 
+ */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); - if (pmd_numa(*pmd)) - entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1954,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (unlikely(khugepaged_enter_vma_merge(vma))) + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -2032,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON(khugepaged_test_exit(mm)); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; @@ -2055,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm) return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { unsigned long hstart, hend; if (!vma->anon_vma) @@ -2067,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); + VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma); + return khugepaged_enter(vma, vm_flags); return 0; } @@ -2233,6 +2250,30 @@ static void khugepaged_alloc_sleep(void) static int khugepaged_node_load[MAX_NUMNODES]; +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. + */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + #ifdef CONFIG_NUMA static int khugepaged_find_target_node(void) { @@ -2282,23 +2323,17 @@ static struct page int node) { VM_BUG_ON_PAGE(*hpage, *hpage); + /* - * Allocate the page while the vma is still valid and under - * the mmap_sem read mode so there is no memory allocation - * later when we take the mmap_sem in write mode. This is more - * friendly behavior (OTOH it may actually hide bugs) to - * filesystems in userland with daemons allocating memory in - * the userland I/O paths. Allocating memory with the - * mmap_sem in read mode is good idea also to allow greater - * scalability. + * Before allocating the hugepage, release the mmap_sem read lock. + * The allocation can take potentially a long time if it involves + * sync compaction, and we do not need to hold the mmap_sem during + * that. We will recheck the vma after taking it again in write mode. */ + up_read(&mm->mmap_sem); + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); - /* - * After allocating the hugepage, release the mmap_sem read lock in - * preparation for taking it in write mode. 
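Every conversion in this hunk follows the same new charging protocol: reserve with mem_cgroup_try_charge() before taking page table locks, then either publish the page and mem_cgroup_commit_charge() it, or mem_cgroup_cancel_charge() on every failure path. Condensed to its skeleton, as a sketch rather than a literal caller from this patch; map_the_page() is a hypothetical stand-in for the caller-specific page-table work:

static int fault_in_new_page(struct page *page, struct mm_struct *mm,
			     struct vm_area_struct *vma, unsigned long addr)
{
	struct mem_cgroup *memcg;

	/* 1. Reserve the charge up front, outside any page table lock. */
	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
		return VM_FAULT_OOM;

	if (map_the_page(page, vma, addr)) {	/* hypothetical */
		/* Any failure after try_charge must cancel the reservation. */
		mem_cgroup_cancel_charge(page, memcg);
		return VM_FAULT_OOM;
	}

	/* 2. The page is mapped: make it visible and commit the charge. */
	page_add_new_anon_rmap(page, vma, addr);
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_active_or_unevictable(page, vma);
	return 0;
}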
- */ - up_read(&mm->mmap_sem); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2372,7 +2407,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); + VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); return true; } @@ -2389,6 +2424,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; + struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2399,7 +2435,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) return; /* @@ -2486,6 +2523,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); @@ -2499,7 +2538,7 @@ out_up_write: return; out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; } @@ -2545,6 +2584,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); + if (khugepaged_scan_abort(node)) + goto out_unmap; khugepaged_node_load[node]++; VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a0a73d..9fd7227 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,6 @@ #include <linux/node.h> #include "internal.h" -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -435,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (vma->vm_flags & VM_MAYSHARE) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -450,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -459,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); return (get_vma_private_data(vma) & flag) != 0; } @@ -475,7 +474,7 @@ static int 
is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } @@ -1089,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; struct hstate *h; + if (!hugepages_supported()) + return; + /* Set scan step to minimum hugepage size */ for_each_hstate(h) if (order > huge_page_order(h)) @@ -1734,21 +1736,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) { int err; - int nid; - unsigned long count; - struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = kstrtoul(buf, 10, &count); - if (err) - goto out; - - h = kobj_to_hstate(kobj, &nid); if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; @@ -1784,6 +1778,23 @@ out: return err; } +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1793,7 +1804,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj, static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(false, kobj, attr, buf, len); + return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); @@ -1812,7 +1823,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(true, kobj, attr, buf, len); + return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif @@ -2248,36 +2259,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; - unsigned long tmp; + unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -ENOTSUPP; - tmp = h->max_huge_pages; - - if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) - return -EINVAL; - table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); if (ret) goto out; - if (write) { - NODEMASK_ALLOC(nodemask_t, nodes_allowed, - GFP_KERNEL | __GFP_NORETRY); - if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); - } + if (write) + ret = 
__nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); out: return ret; } @@ -2754,8 +2750,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -2794,8 +2790,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); - - return 1; } /* @@ -2810,7 +2804,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int outside_reserve = 0; + int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2840,14 +2834,14 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page table lock as buddy allocator may be called */ + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { - long err = PTR_ERR(new_page); - page_cache_release(old_page); - /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -2856,29 +2850,25 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + page_cache_release(old_page); BUG_ON(huge_pte_none(pte)); - if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(huge_pte_none(pte)); - spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) - goto retry_avoidcopy; - /* - * race occurs while re-acquiring page table - * lock, and our job is done. - */ - return 0; - } - WARN_ON_ONCE(1); + unmap_ref_private(mm, vma, old_page, address); + BUG_ON(huge_pte_none(pte)); + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + return 0; } - /* Caller expects lock to be held */ - spin_lock(ptl); - if (err == -ENOMEM) - return VM_FAULT_OOM; - else - return VM_FAULT_SIGBUS; + ret = (PTR_ERR(new_page) == -ENOMEM) ? + VM_FAULT_OOM : VM_FAULT_SIGBUS; + goto out_release_old; } /* @@ -2886,11 +2876,8 @@ retry_avoidcopy: * anon_vma prepared. 
*/ if (unlikely(anon_vma_prepare(vma))) { - page_cache_release(new_page); - page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto out_release_all; } copy_user_huge_page(new_page, old_page, address, vma, @@ -2900,6 +2887,7 @@ retry_avoidcopy: mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Retake the page table lock to check for racing updates * before the page tables are altered @@ -2920,12 +2908,13 @@ retry_avoidcopy: } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out_release_all: page_cache_release(new_page); +out_release_old: page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return 0; + spin_lock(ptl); /* Caller expects lock to be held */ + return ret; } /* Return the pagecache page at a given address within a VMA */ diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9aae6f4..a67c26e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -217,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); + lockdep_assert_held(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(page); if (unlikely(!h_cg)) return; @@ -275,6 +275,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; + val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx])); ret = res_counter_set_limit(&h_cg->hugepage[idx], val); break; default: diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 95487c7..329caf5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { - if (hwpoison_dir) - debugfs_remove_recursive(hwpoison_dir); + debugfs_remove_recursive(hwpoison_dir); } static int pfn_inject_init(void) diff --git a/mm/internal.h b/mm/internal.h index 7f22a11f..a4f90ba 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). 
+ * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline unsigned long +__find_buddy_index(unsigned long page_idx, unsigned int order) +{ + return page_idx ^ (1 << order); +} + +extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -142,10 +167,10 @@ struct compact_control { bool finished_update_migrate; int order; /* order a direct compactor needs */ - int migratetype; /* MOVABLE, RECLAIMABLE etc */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ struct zone *zone; - bool contended; /* True if a lock was contended, or - * need_resched() true during async + int contended; /* Signal need_sched() or lock + * contention detected during * compaction */ }; @@ -154,8 +179,8 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); unsigned long -isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn, bool unevictable); +isolate_migratepages_range(struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn); #endif @@ -164,7 +189,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the - * page cannot be allocated or merged in parallel. + * page cannot be allocated or merged in parallel. Alternatively, it must + * handle invalid values gracefully, and use page_order_unsafe() below. */ static inline unsigned long page_order(struct page *page) { @@ -172,6 +198,19 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +/* + * Like page_order(), but for callers who cannot afford to hold the zone lock. + * PageBuddy() should be checked first by the caller to minimize race window, + * and invalid values must be handled gracefully. + * + * ACCESS_ONCE is used so that if the caller assigns the result into a local + * variable and e.g. tests it for valid range before using, the compiler cannot + * decide to remove the variable and inline the page_private(page) multiple + * times, potentially observing different values in the tests and the actual + * use of the result. 
+ */ +#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) + static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; @@ -247,7 +286,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline struct page *mem_map_offset(struct page *base, int offset) { if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return pfn_to_page(page_to_pfn(base) + offset); + return nth_page(base, offset); return base + offset; } diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4a5822a..8da581f 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *parent; unsigned long last = vma_last_pgoff(node); - VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); + VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); if (!prev->shared.linear.rb.rb_right) { parent = prev; diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 7b5dbd1..e34a3cb 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -4,6 +4,96 @@ #include <linux/slab.h> #include <linux/vmalloc.h> +static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + left = __copy_from_user(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -166,6 +256,50 @@ done: return wanted - bytes; } +static size_t zero_iovec(size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + left = __clear_user(buf, copy); + copy -= left; + skip 
+= copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __clear_user(buf, copy); + copy -= left; + skip = copy; + bytes -= copy; + } + + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { @@ -310,7 +444,7 @@ void iov_iter_init(struct iov_iter *i, int direction, EXPORT_SYMBOL(iov_iter_init); static ssize_t get_pages_iovec(struct iov_iter *i, - struct page **pages, size_t maxsize, + struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { size_t offset = i->iov_offset; @@ -327,6 +461,8 @@ static ssize_t get_pages_iovec(struct iov_iter *i, len = maxsize; addr = (unsigned long)iov->iov_base + offset; len += *start = addr & (PAGE_SIZE - 1); + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = (len + PAGE_SIZE - 1) / PAGE_SIZE; res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); @@ -412,12 +548,17 @@ static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t kunmap_atomic(to); } -static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) +static void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + +static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i) { size_t skip, copy, wanted; const struct bio_vec *bvec; - void *kaddr, *from; if (unlikely(bytes > i->count)) bytes = i->count; @@ -430,8 +571,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by skip = i->iov_offset; copy = min_t(size_t, bytes, bvec->bv_len - skip); - kaddr = kmap_atomic(page); - from = kaddr + offset; memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); skip += copy; from += copy; @@ -444,7 +583,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by from += copy; bytes -= copy; } - kunmap_atomic(kaddr); if (skip == bvec->bv_len) { bvec++; skip = 0; @@ -456,12 +594,10 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by return wanted - bytes; } -static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) +static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i) { size_t skip, copy, wanted; const struct bio_vec *bvec; - void *kaddr, *to; if (unlikely(bytes > i->count)) bytes = i->count; @@ -473,10 +609,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bvec = i->bvec; skip = i->iov_offset; - kaddr = kmap_atomic(page); - - to = kaddr + offset; - copy = min(bytes, bvec->bv_len - skip); memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); @@ -493,7 +625,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t to += copy; bytes -= copy; } - kunmap_atomic(kaddr); if (skip == bvec->bv_len) { bvec++; skip = 0; @@ -505,6 +636,61 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t return wanted; } +static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, + size_t bytes, struct iov_iter *i) +{ + void *kaddr = kmap_atomic(page); + size_t wanted = 
copy_to_iter_bvec(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; +} + +static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, + size_t bytes, struct iov_iter *i) +{ + void *kaddr = kmap_atomic(page); + size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; +} + +static size_t zero_bvec(size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + copy = min_t(size_t, bytes, bvec->bv_len - skip); + + memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy); + skip += copy; + bytes -= copy; + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memzero_page(bvec->bv_page, bvec->bv_offset, copy); + skip = copy; + bytes -= copy; + } + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted - bytes; +} + static size_t copy_from_user_bvec(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { @@ -588,7 +774,7 @@ static unsigned long alignment_bvec(const struct iov_iter *i) } static ssize_t get_pages_bvec(struct iov_iter *i, - struct page **pages, size_t maxsize, + struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { const struct bio_vec *bvec = i->bvec; @@ -597,6 +783,7 @@ static ssize_t get_pages_bvec(struct iov_iter *i, len = i->count; if (len > maxsize) len = maxsize; + /* can't be more than PAGE_SIZE */ *start = bvec->bv_offset + i->iov_offset; get_page(*pages = bvec->bv_page); @@ -669,6 +856,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_to_iter_bvec(addr, bytes, i); + else + return copy_to_iter_iovec(addr, bytes, i); +} +EXPORT_SYMBOL(copy_to_iter); + +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_from_iter_bvec(addr, bytes, i); + else + return copy_from_iter_iovec(addr, bytes, i); +} +EXPORT_SYMBOL(copy_from_iter); + +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +{ + if (i->type & ITER_BVEC) { + return zero_bvec(bytes, i); + } else { + return zero_iovec(bytes, i); + } +} +EXPORT_SYMBOL(iov_iter_zero); + size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { @@ -696,9 +911,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) - return min(i->count, i->iov->iov_len - i->iov_offset); - else return min(i->count, i->bvec->bv_len - i->iov_offset); + else + return min(i->count, i->iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); @@ -712,13 +927,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) EXPORT_SYMBOL(iov_iter_alignment); ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, size_t maxsize, + struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (i->type & ITER_BVEC) - return get_pages_bvec(i, pages, maxsize, start); + return get_pages_bvec(i, pages, maxsize, maxpages, start); else - return get_pages_iovec(i, pages, maxsize, start); + return get_pages_iovec(i, pages, 
maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index fd814fd..cab58bb 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -2,6 +2,7 @@ #include <linux/mm_types.h> #include <linux/mm.h> #include <linux/slab.h> +#include "slab.h" #include <linux/kmemcheck.h> void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) @@ -2310,7 +2310,7 @@ static int __init ksm_init(void) ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { - printk(KERN_ERR "ksm: creating kthread failed\n"); + pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } @@ -2318,7 +2318,7 @@ static int __init ksm_init(void) #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { - printk(KERN_ERR "ksm: register sysfs failed\n"); + pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } diff --git a/mm/madvise.c b/mm/madvise.c index a402f8f..0938b30 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma, /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. - * - * NOTE: Currently, only shmfs/tmpfs is supported for this operation. - * Other filesystems return -ENOSYS. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, diff --git a/mm/memblock.c b/mm/memblock.c index 6d2f219..6ecb0d9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -192,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid) { - int ret; - phys_addr_t kernel_end; + phys_addr_t kernel_end, ret; /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) @@ -817,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, if (nid != NUMA_NO_NODE && nid != m_nid) continue; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f009a14..d6ac0e3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,6 +292,9 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; + /* css_online() has been completed */ + int initialized; + /* * the counter to account for mem+swap usage. */ @@ -315,9 +318,6 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* set when res.limit == memsw.limit */ - bool memsw_is_minimum; - /* protect arrays of thresholds */ struct mutex thresholds_lock; @@ -481,14 +481,6 @@ enum res_type { #define OOM_CONTROL (0) /* - * Reclaim flags for mem_cgroup_hierarchical_reclaim - */ -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 -#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) -#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 -#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) - -/* * The memcg_create_mutex will be held whenever a new cgroup is created. * As a consequence, any change that needs to protect against new child cgroups * appearing has to hold it as well. 
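The new iov_iter entry points added in mm/iov_iter.c above (copy_to_iter(), copy_from_iter() and iov_iter_zero()) dispatch on ITER_BVEC internally, so a caller no longer needs to care whether the iterator is backed by user iovecs or by bio_vecs. A minimal read-side sketch follows; example_read_tail(), its buffer layout and its error handling are illustrative assumptions, not part of this patch:

/*
 * Hypothetical caller of the new iov_iter primitives: copy what is
 * available from a kernel buffer, then zero-pad up to the requested
 * length (e.g. a read extending past EOF). Assumes want >= have;
 * error handling is simplified for illustration.
 */
static ssize_t example_read_tail(struct iov_iter *to, void *kbuf,
				 size_t have, size_t want)
{
	size_t copied;

	/* copies at most min(have, i->count) bytes, advances the iterator */
	copied = copy_to_iter(kbuf, have, to);
	if (copied < have)
		return -EFAULT;	/* short copy: faulted on user memory */

	/* zero-fill the remainder; same call for iovec and bvec backings */
	copied += iov_iter_zero(want - have, to);

	return copied;
}

Both helpers honour the iterator's remaining byte count, which is also why iov_iter_get_pages() grew the explicit maxpages argument above: the capacity of the caller's page array, unlike the byte count, is not tracked in the iterator itself.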
@@ -646,11 +638,13 @@ int memcg_limited_groups_array_size; struct static_key memcg_kmem_enabled_key; EXPORT_SYMBOL(memcg_kmem_enabled_key); +static void memcg_free_cache_id(int id); + static void disarm_kmem_keys(struct mem_cgroup *memcg) { if (memcg_kmem_is_active(memcg)) { static_key_slow_dec(&memcg_kmem_enabled_key); - ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + memcg_free_cache_id(memcg->kmemcg_id); } /* * This check can't live in kmem destruction function, @@ -754,9 +748,11 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); __mem_cgroup_remove_exceeded(mz, mctz); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } @@ -779,7 +775,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * mem is over its softlimit. */ if (excess || mz->on_tree) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) __mem_cgroup_remove_exceeded(mz, mctz); @@ -788,7 +786,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * If excess is 0, no tree ops. */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } } } @@ -839,9 +837,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { struct mem_cgroup_per_zone *mz; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); return mz; } @@ -882,13 +880,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, return val; } -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) -{ - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -909,13 +900,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. 
*/ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@ -1013,7 +1004,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { - preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -1026,8 +1016,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); #endif - preempt_enable(); - mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); @@ -1035,8 +1023,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); #endif - } else - preempt_enable(); + } } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1106,10 +1093,21 @@ skip_node: * skipping css reference should be safe. */ if (next_css) { - if ((next_css == &root->css) || - ((next_css->flags & CSS_ONLINE) && - css_tryget_online(next_css))) - return mem_cgroup_from_css(next_css); + struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); + + if (next_css == &root->css) + return memcg; + + if (css_tryget_online(next_css)) { + /* + * Make sure the memcg is initialized: + * mem_cgroup_css_online() orders the + * initialization against setting the flag. + */ + if (smp_load_acquire(&memcg->initialized)) + return memcg; + css_put(next_css); + } prev_css = next_css; goto skip_node; @@ -1347,20 +1345,6 @@ out: return lruvec; } -/* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@ -1552,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) * start move here. */ -/* for quick checking without looking up memcg */ -atomic_t memcg_moving __read_mostly; - static void mem_cgroup_start_move(struct mem_cgroup *memcg) { - atomic_inc(&memcg_moving); atomic_inc(&memcg->moving_account); synchronize_rcu(); } @@ -1568,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * Now, mem_cgroup_clear_mc() may call this function with NULL. * We check NULL in callee rather than caller.
*/ - if (memcg) { - atomic_dec(&memcg_moving); + if (memcg) atomic_dec(&memcg->moving_account); - } } /* @@ -1813,42 +1791,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, NULL, "Memory cgroup out of memory"); } -static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned long flags) -{ - unsigned long total = 0; - bool noswap = false; - int loop; - - if (flags & MEM_CGROUP_RECLAIM_NOSWAP) - noswap = true; - if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) - noswap = true; - - for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { - if (loop) - drain_all_stock_async(memcg); - total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); - /* - * Allow limit shrinkers, which are triggered directly - * by userspace, to catch signals and stop reclaim - * after minimal progress, regardless of the margin. - */ - if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) - break; - if (mem_cgroup_margin(memcg)) - break; - /* - * If nothing was reclaimed after two attempts, there - * may be no reclaimable pages in this hierarchy. - */ - if (loop && !total) - break; - } - return total; -} - /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -2256,49 +2198,52 @@ cleanup: return true; } -/* - * Used to update mapped file or writeback or other statistics. - * - * Notes: Race condition +/** + * mem_cgroup_begin_page_stat - begin a page state statistics transaction + * @page: page that is going to change accounted state + * @locked: &memcg->move_lock slowpath was taken + * @flags: IRQ-state flags for &memcg->move_lock * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. + * This function must mark the beginning of an accounted page state + * change to prevent double accounting when the page is concurrently + * being moved to another memcg: * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(memcg, state, -1); + * mem_cgroup_end_page_stat(memcg, locked, flags); * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. + * The RCU lock is held throughout the transaction. The fast path can + * get away without acquiring the memcg->move_lock (@locked is false) + * because page moving starts with an RCU grace period. * - * Considering "move", this is an only case we see a race. To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * The RCU lock also protects the memcg from being freed when the page + * state that is going to change is the only thing preventing the page + * from being uncharged. E.g. end-writeback clearing PageWriteback(), + * which allows migration to go ahead and uncharge the page before the + * account transaction might be complete. 
*/ - -void __mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, + bool *locked, + unsigned long *flags) { struct mem_cgroup *memcg; struct page_cgroup *pc; + rcu_read_lock(); + + if (mem_cgroup_disabled()) + return NULL; + pc = lookup_page_cgroup(page); again: memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - /* - * If this memory cgroup is not under account moving, we don't - * need to take move_lock_mem_cgroup(). Because we already hold - * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock(). - */ - VM_BUG_ON(!rcu_read_lock_held()); + return NULL; + + *locked = false; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; move_lock_mem_cgroup(memcg, flags); if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { @@ -2306,36 +2251,40 @@ again: goto again; } *locked = true; + + return memcg; } -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +/** + * mem_cgroup_end_page_stat - finish a page state statistics transaction + * @memcg: the memcg that was accounted against + * @locked: value received from mem_cgroup_begin_page_stat() + * @flags: value received from mem_cgroup_begin_page_stat() + */ +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags) { - struct page_cgroup *pc = lookup_page_cgroup(page); + if (memcg && locked) + move_unlock_mem_cgroup(memcg, &flags); - /* - * It's guaranteed that pc->mem_cgroup never changes while - * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_mem_cgroup(). - */ - move_unlock_mem_cgroup(pc->mem_cgroup, flags); + rcu_read_unlock(); } -void mem_cgroup_update_page_stat(struct page *page, +/** + * mem_cgroup_update_page_stat - update page state statistics + * @memcg: memcg to account against + * @idx: page state item to account + * @val: number of pages (positive or negative) + * + * See mem_cgroup_begin_page_stat() for locking requirements. + */ +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val) { - struct mem_cgroup *memcg; - struct page_cgroup *pc = lookup_page_cgroup(page); - unsigned long uninitialized_var(flags); - - if (mem_cgroup_disabled()) - return; - VM_BUG_ON(!rcu_read_lock_held()); - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - this_cpu_add(memcg->stat->count[idx], val); + if (memcg) + this_cpu_add(memcg->stat->count[idx], val); } /* @@ -2551,55 +2500,74 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. 
*/ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) +static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; - unsigned long flags = 0; - int ret; - - ret = res_counter_charge(&memcg->res, csize, &fail_res); + unsigned long nr_reclaimed; + unsigned long long size; + bool may_swap = true; + bool drained = false; + int ret = 0; - if (likely(!ret)) { - if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; + if (mem_cgroup_is_root(memcg)) + goto done; +retry: + if (consume_stock(memcg, nr_pages)) + goto done; - res_counter_uncharge(&memcg->res, csize); - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; - } else + size = batch * PAGE_SIZE; + if (!do_swap_account || + !res_counter_charge(&memcg->memsw, size, &fail_res)) { + if (!res_counter_charge(&memcg->res, size, &fail_res)) + goto done_restock; + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + } else { + mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + may_swap = false; + } + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem; - if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, + gfp_mask, may_swap); - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + goto retry; + + if (!drained) { + drain_all_stock_async(mem_over_limit); + drained = true; + goto retry; + } + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@ -2609,142 +2577,48 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. 
*/ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; -} - -/** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ -static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) -{ - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry; - if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry; if (gfp_mask & __GFP_NOFAIL) - oom = false; -again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass; - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass; - if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); -done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: return -EINTR; -} - -/** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ -static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - -{ - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - memcg = NULL; - return memcg; +done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); +done: + return ret; } -/* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). 
- */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) +static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + if (mem_cgroup_is_root(memcg)) + return; + + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); } /* @@ -2779,6 +2653,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return mem_cgroup_from_id(id); } +/* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@ -2789,7 +2673,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; if (memcg && !css_tryget_online(&memcg->css)) @@ -2803,23 +2686,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) memcg = NULL; rcu_read_unlock(); } - unlock_page_cgroup(pc); return memcg; } -static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) +static void lock_page_lru(struct page *page, int *isolated) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + *isolated = 1; + } else + *isolated = 0; +} + +static void unlock_page_lru(struct page *page, int isolated) +{ + struct zone *zone = page_zone(page); + + if (isolated) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); +} + +static void commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); - struct zone *uninitialized_var(zone); - struct lruvec *lruvec; - bool was_on_lru = false; - bool anon; + int isolated; - lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not @@ -2830,52 +2736,28 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page * may already be on some other mem_cgroup's LRU. Take care of it. */ - if (lrucare) { - zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - was_on_lru = true; - } - } + if (lrucare) + lock_page_lru(page, &isolated); - pc->mem_cgroup = memcg; /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). 
- * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked */ - smp_wmb(); - SetPageCgroupUsed(pc); - - if (lrucare) { - if (was_on_lru) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&zone->lru_lock); - } - - if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; - - mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); + pc->mem_cgroup = memcg; + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); - /* - * "charge_statistics" updated event counter. Then, check it. - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. - * if they exceeds softlimit. - */ - memcg_check_events(memcg, page); + if (lrucare) + unlock_page_lru(page, isolated); } static DEFINE_MUTEX(set_limit_mutex); @@ -2889,12 +2771,6 @@ static DEFINE_MUTEX(memcg_slab_mutex); static DEFINE_MUTEX(activate_kmem_mutex); -static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) -{ - return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && - memcg_kmem_is_active(memcg); -} - /* * This is a bit cumbersome, but it is rarely used and avoids a backpointer * in the memcg_cache_params struct. @@ -2914,7 +2790,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct memcg_cache_params *params; - if (!memcg_can_account_kmem(memcg)) + if (!memcg_kmem_is_active(memcg)) return -EIO; print_slabinfo_header(m); @@ -2937,22 +2813,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) if (ret) return ret; - ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. 
Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@ -2998,19 +2873,44 @@ int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -static size_t memcg_caches_array_size(int num_groups) +static int memcg_alloc_cache_id(void) { - ssize_t size; - if (num_groups <= 0) - return 0; + int id, size; + int err; - size = 2 * num_groups; + id = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (id < 0) + return id; + + if (id < memcg_limited_groups_array_size) + return id; + + /* + * There's no space for the new id in memcg_caches arrays, + * so we have to grow them. + */ + + size = 2 * (id + 1); if (size < MEMCG_CACHES_MIN_SIZE) size = MEMCG_CACHES_MIN_SIZE; else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - return size; + mutex_lock(&memcg_slab_mutex); + err = memcg_update_all_caches(size); + mutex_unlock(&memcg_slab_mutex); + + if (err) { + ida_simple_remove(&kmem_limited_groups, id); + return err; + } + return id; +} + +static void memcg_free_cache_id(int id) +{ + ida_simple_remove(&kmem_limited_groups, id); } /* @@ -3020,97 +2920,7 @@ static size_t memcg_caches_array_size(int num_groups) */ void memcg_update_array_size(int num) { - if (num > memcg_limited_groups_array_size) - memcg_limited_groups_array_size = memcg_caches_array_size(num); -} - -int memcg_update_cache_size(struct kmem_cache *s, int num_groups) -{ - struct memcg_cache_params *cur_params = s->memcg_params; - - VM_BUG_ON(!is_root_cache(s)); - - if (num_groups > memcg_limited_groups_array_size) { - int i; - struct memcg_cache_params *new_params; - ssize_t size = memcg_caches_array_size(num_groups); - - size *= sizeof(void *); - size += offsetof(struct memcg_cache_params, memcg_caches); - - new_params = kzalloc(size, GFP_KERNEL); - if (!new_params) - return -ENOMEM; - - new_params->is_root_cache = true; - - /* - * There is the chance it will be bigger than - * memcg_limited_groups_array_size, if we failed an allocation - * in a cache, in which case all caches updated before it, will - * have a bigger array. - * - * But if that is the case, the data after - * memcg_limited_groups_array_size is certainly unused - */ - for (i = 0; i < memcg_limited_groups_array_size; i++) { - if (!cur_params->memcg_caches[i]) - continue; - new_params->memcg_caches[i] = - cur_params->memcg_caches[i]; - } - - /* - * Ideally, we would wait until all caches succeed, and only - * then free the old one. But this is not worth the extra - * pointer per-cache we'd have to have for this. - * - * It is not a big deal if some caches are left with a size - * bigger than the others. And all updates will reset this - * anyway. 
- */ - rcu_assign_pointer(s->memcg_params, new_params); - if (cur_params) - kfree_rcu(cur_params, rcu_head); - } - return 0; -} - -int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, - struct kmem_cache *root_cache) -{ - size_t size; - - if (!memcg_kmem_enabled()) - return 0; - - if (!memcg) { - size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_limited_groups_array_size * sizeof(void *); - } else - size = sizeof(struct memcg_cache_params); - - s->memcg_params = kzalloc(size, GFP_KERNEL); - if (!s->memcg_params) - return -ENOMEM; - - if (memcg) { - s->memcg_params->memcg = memcg; - s->memcg_params->root_cache = root_cache; - css_get(&memcg->css); - } else - s->memcg_params->is_root_cache = true; - - return 0; -} - -void memcg_free_cache_params(struct kmem_cache *s) -{ - if (!s->memcg_params) - return; - if (!s->memcg_params->is_root_cache) - css_put(&s->memcg_params->memcg->css); - kfree(s->memcg_params); + memcg_limited_groups_array_size = num; } static void memcg_register_cache(struct mem_cgroup *memcg, @@ -3143,6 +2953,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (!cachep) return; + css_get(&memcg->css); list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* @@ -3176,6 +2987,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) list_del(&cachep->memcg_params->list); kmem_cache_destroy(cachep); + + /* drop the reference taken in memcg_register_cache */ + css_put(&memcg->css); } /* @@ -3353,7 +3167,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); - if (!memcg_can_account_kmem(memcg)) + if (!memcg_kmem_is_active(memcg)) goto out; memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); @@ -3438,7 +3252,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) memcg = get_mem_cgroup_from_mm(current->mm); - if (!memcg_can_account_kmem(memcg)) { + if (!memcg_kmem_is_active(memcg)) { css_put(&memcg->css); return true; } @@ -3463,12 +3277,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3478,19 +3293,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) pc = lookup_page_cgroup(page); - /* - * Fast unlocked return. Theoretically might have changed, have to - * check again after locking. - */ if (!PageCgroupUsed(pc)) return; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0; /* * We trust that only if there is a memcg associated with the page, it @@ -3510,7 +3317,6 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. 
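Before the transparent-hugepage split fixup below, the page-state transaction API introduced earlier is worth seeing from a caller's side. This sketch expands the usage pattern quoted in the mem_cgroup_begin_page_stat() kernel-doc; example_end_writeback() and the choice of MEM_CGROUP_STAT_WRITEBACK are plausible assumptions for illustration, not lines from this patch:

/*
 * Hypothetical caller of the page-stat transaction API. The fast
 * path holds only rcu_read_lock(), taken inside
 * mem_cgroup_begin_page_stat(); memcg->move_lock is acquired
 * (locked == true) only while the page may be moving between
 * cgroups, so the update cannot be double-counted.
 */
static void example_end_writeback(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;
	bool locked;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	if (TestClearPageWriteback(page))
		mem_cgroup_update_page_stat(memcg,
					    MEM_CGROUP_STAT_WRITEBACK, -1);
	mem_cgroup_end_page_stat(memcg, locked, flags);
}

Note that begin and end must always pair, even when begin returns NULL for a disabled controller or an uncharged page: rcu_read_lock() is taken unconditionally on entry and dropped only in mem_cgroup_end_page_stat(), and both mem_cgroup_update_page_stat() and the unlock path tolerate a NULL memcg.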
@@ -3531,8 +3337,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); @@ -3562,7 +3367,6 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; - bool anon = PageAnon(page); VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -3576,15 +3380,21 @@ static int mem_cgroup_move_account(struct page *page, if (nr_pages > 1 && !PageTransHuge(page)) goto out; - lock_page_cgroup(pc); + /* + * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out; ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out_unlock; move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) { + if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@ -3598,20 +3408,25 @@ static int mem_cgroup_move_account(struct page *page, nr_pages); } - mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; -unlock: - unlock_page_cgroup(pc); - /* - * check events - */ + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); + local_irq_enable(); +out_unlock: + unlock_page(page); out: return ret; } @@ -3682,456 +3497,12 @@ out: return ret; } -int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; - bool oom = true; - - if (mem_cgroup_disabled()) - return 0; - - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, nr_pages, - MEM_CGROUP_CHARGE_TYPE_ANON, false); - return 0; -} - -/* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. 
This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ -static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - int ret; - - pc = lookup_page_cgroup(page); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. - */ - if (PageCgroupUsed(pc)) - goto out; - if (do_swap_account) - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - return ret; -out: - *memcgp = memcg; - return 0; -} - -int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) -{ - if (mem_cgroup_disabled()) { - *memcgp = NULL; - return 0; - } - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - *memcgp = memcg; - return 0; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); -} - -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); -} - -static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. 
- */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } -} - -void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) -{ - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - -int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) -{ - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - struct mem_cgroup *memcg; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (PageSwapCache(page)) { /* shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (ret) - return ret; - __mem_cgroup_commit_charge_swapin(page, memcg, type); - return 0; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - return 0; -} - -static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) -{ - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; -direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); -} - -/* - * uncharge if !page_mapped(page) - */ -static struct mem_cgroup * -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. 
mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration && !mem_cgroup_is_root(memcg)) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - -unlock_out: - unlock_page_cgroup(pc); - return NULL; -} - -void mem_cgroup_uncharge_page(struct page *page) -{ - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); -} - -void mem_cgroup_uncharge_cache_page(struct page *page) -{ - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); -} - -/* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - -void mem_cgroup_uncharge_start(void) -{ - current->memcg_batch.do_batch++; - /* We can do nest. 
*/ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } -} - -void mem_cgroup_uncharge_end(void) -{ - struct memcg_batch_info *batch = ¤t->memcg_batch; - - if (!batch->do_batch) - return; - - batch->do_batch--; - if (batch->do_batch) /* If stacked, do nothing. */ - return; - - if (!batch->memcg) - return; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. - */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); - /* forget this pointer (for sanity check) */ - batch->memcg = NULL; -} - -#ifdef CONFIG_SWAP -/* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ -void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); -} -#endif - #ifdef CONFIG_MEMCG_SWAP -/* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ -void mem_cgroup_uncharge_swap(swp_entry_t ent) +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. This memcg can - * be obsolete one. We avoid calling css_tryget_online(). - */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); } /** @@ -4183,175 +3554,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -/* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ -void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - enum charge_type ctype; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. 
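The mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() pair removed above existed purely so that bulk frees could coalesce res_counter updates through current->memcg_batch. A sketch of the old usage, with zap_pages() standing in for callers like unmap_page_range() or the truncate path (the wrapper name is illustrative, not from the kernel):

    static void zap_pages(struct page **pages, int nr)
    {
            int i;

            mem_cgroup_uncharge_start();                /* open the per-task batch */
            for (i = 0; i < nr; i++)
                    mem_cgroup_uncharge_page(pages[i]); /* queued, not applied */
            mem_cgroup_uncharge_end();                  /* one res_counter hit per memcg run */
    }

The uncharge_list() scheme added further down gets the same coalescing from the page list itself, so callers no longer have to bracket anything.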
- * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); -} - -/* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) -{ - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. - */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); -} - -/* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. 
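For contrast with the PCG_MIGRATION machinery being deleted here, migration after this patch needs no flag and no paired teardown call. The before/after shape, where migrate_the_data() is a placeholder for the actual unmap-and-copy work rather than a real function:

    /* old: charge pinned across the operation by the MIGRATION flag */
    mem_cgroup_prepare_migration(page, newpage, &memcg);
    rc = migrate_the_data(page, newpage);
    mem_cgroup_end_migration(memcg, page, newpage, rc == MIGRATEPAGE_SUCCESS);

    /* new: one call once both pages are locked, see mem_cgroup_migrate()
     * and the mm/migrate.c hunk at the end of this patch */
    mem_cgroup_migrate(page, newpage, false);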
- */ -void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) -{ - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -4392,7 +3594,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memswlimit, memlimit; int ret = 0; int children = mem_cgroup_count_children(memcg); u64 curusage, oldusage; @@ -4419,31 +3620,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, * We have to guarantee memcg->res.limit <= memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); - memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - if (memswlimit < val) { + if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { ret = -EINVAL; mutex_unlock(&set_limit_mutex); break; } - memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); - if (memlimit < val) + if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) enlarge = 1; ret = res_counter_set_limit(&memcg->res, val); - if (!ret) { - if (memswlimit == val) - memcg->memsw_is_minimum = true; - else - memcg->memsw_is_minimum = false; - } mutex_unlock(&set_limit_mutex); if (!ret) break; - mem_cgroup_reclaim(memcg, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK); + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); + curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -4461,7 +3654,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memlimit, memswlimit, oldusage, curusage; + u64 oldusage, curusage; int children = mem_cgroup_count_children(memcg); int ret = -EBUSY; int enlarge = 0; @@ -4480,30 +3673,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, * We have to guarantee memcg->res.limit <= memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); - memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); - if (memlimit > val) { + if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { ret = -EINVAL; mutex_unlock(&set_limit_mutex); break; } - memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - if (memswlimit < val) + if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) enlarge = 1; ret = res_counter_set_limit(&memcg->memsw, val); - if (!ret) { - if (memlimit == val) - memcg->memsw_is_minimum = true; - else - memcg->memsw_is_minimum = false; - } mutex_unlock(&set_limit_mutex); if (!ret) break; - mem_cgroup_reclaim(memcg, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK); + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); + curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? 
*/ if (curusage >= oldusage) @@ -4550,7 +3734,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); /* * If we failed to reclaim anything from this memory cgroup @@ -4590,7 +3774,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, */ /* If excess == 0, no tree ops */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); css_put(&mz->memcg->css); loop++; /* @@ -4752,8 +3936,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) if (signal_pending(current)) return -EINTR; - progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, - false); + progress = try_to_free_mem_cgroup_pages(memcg, 1, + GFP_KERNEL, true); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4817,7 +4001,6 @@ out: return retval; } - static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { @@ -4857,38 +4040,29 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } + static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return mem_cgroup_usage(memcg, false); + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return mem_cgroup_usage(memcg, true); + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; } #ifdef CONFIG_MEMCG_KMEM @@ -4928,23 +4102,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, if (err) goto out; - memcg_id = ida_simple_get(&kmem_limited_groups, - 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) { err = memcg_id; goto out; } - /* - * Make sure we have enough space for this cgroup in each root cache's - * memcg_params. 
- */ - mutex_lock(&memcg_slab_mutex); - err = memcg_update_all_caches(memcg_id + 1); - mutex_unlock(&memcg_slab_mutex); - if (err) - goto out_rmid; - memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); @@ -4965,10 +4128,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, out: memcg_resume_kmem_account(); return err; - -out_rmid: - ida_simple_remove(&kmem_limited_groups, memcg_id); - goto out; } static int memcg_activate_kmem(struct mem_cgroup *memcg, @@ -5446,15 +4605,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = mem_cgroup_usage(memcg, true); + } else BUG(); - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5534,18 +4693,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, int i, j, size; mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = mem_cgroup_usage(memcg, true); + } else BUG(); if (!thresholds->primary) goto unlock; - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -6276,6 +5436,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); + int ret; if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; @@ -6312,7 +5473,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); - return memcg_init_kmem(memcg, &memory_cgrp_subsys); + ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); + if (ret) + return ret; + + /* + * Make sure the memcg is initialized: mem_cgroup_iter() + * orders reading memcg->initialized against its callers + * reading the memcg members. + */ + smp_store_release(&memcg->initialized, 1); + + return 0; } /* @@ -6435,55 +5607,38 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret; - if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). 
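The smp_store_release() added to mem_cgroup_css_online() above is one half of a pairing; the matching acquire sits in mem_cgroup_iter(), as the new comment says. A minimal sketch of the two sides, with the iterator's css-walk context abbreviated:

    /* publisher: every initialisation store happens before this */
    smp_store_release(&memcg->initialized, 1);

    /* consumer, inside mem_cgroup_iter(): */
    memcg = mem_cgroup_from_css(css);
    if (!smp_load_acquire(&memcg->initialized))
            continue;       /* css is visible, but creation hasn't finished */
    /* acquire guarantees all pre-release stores are visible from here on */

Without the pairing, a concurrent iterator could observe the css before the memcg's fields were fully written.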
- */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + cancel_charge(root_mem_cgroup, count); return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** @@ -6619,9 +5774,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the pc is valid or + * not under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@ -6746,7 +5901,7 @@ static void __mem_cgroup_clear_mc(void) /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@ -6754,7 +5909,7 @@ static void __mem_cgroup_clear_mc(void) * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ @@ -6762,19 +5917,18 @@ static void __mem_cgroup_clear_mc(void) /* uncharge swap account from the old cgroup */ if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); - if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + if (!mem_cgroup_is_root(mc.to)) res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@ -7086,6 +6240,402 @@ static void __init enable_swap_cgroup(void) } #endif +#ifdef CONFIG_MEMCG_SWAP +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. 
+ */ +void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); +} + +/** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ +void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); +} +#endif + +/** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ +int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) +{ + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. + */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } +out: + *memcgp = memcg; + return ret; +} + +/** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. 
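These three entry points are the core of the rework: every charge, whether anon, file or swap-in, now follows one transaction shape. Condensed from the do_anonymous_page() conversion in the mm/memory.c hunks below:

    struct mem_cgroup *memcg;

    if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
            goto oom_free_page;                     /* nothing to unwind yet */
    ...
    page_add_new_anon_rmap(page, vma, address);     /* page->mapping now set */
    mem_cgroup_commit_charge(page, memcg, false);   /* page not on LRU yet */
    lru_cache_add_active_or_unevictable(page, vma);
    ...
    release:
            mem_cgroup_cancel_charge(page, memcg);  /* instantiation failed */
            page_cache_release(page);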
+ * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ +void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) +{ + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + commit_charge(page, memcg, lrucare); + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); + memcg_check_events(memcg, page); + local_irq_enable(); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } +} + +/** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +{ + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); +} + +static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) +{ + unsigned long flags; + + if (!mem_cgroup_is_root(memcg)) { + if (nr_mem) + res_counter_uncharge(&memcg->res, + nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, + nr_memsw * PAGE_SIZE); + memcg_oom_recover(memcg); + } + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); +} + +static void uncharge_list(struct list_head *page_list) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and 
pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); +} + +/** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge(struct page *page) +{ + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + /* Don't touch page->lru of any random page, pre-check: */ + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + return; + + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); +} + +/** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + + if (!list_empty(page_list)) + uncharge_list(page_list); +} + +/** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: both pages might be on the LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) +{ + struct page_cgroup *pc; + int isolated; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), + newpage); + + if (mem_cgroup_disabled()) + return; + + /* Page cache replacement: new page already charged? */ + pc = lookup_page_cgroup(newpage); + if (PageCgroupUsed(pc)) + return; + + /* Re-entrant migration: old page already uncharged? */ + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + + if (lrucare) + lock_page_lru(oldpage, &isolated); + + pc->flags = 0; + + if (lrucare) + unlock_page_lru(oldpage, isolated); + + commit_charge(newpage, pc->mem_cgroup, lrucare); +} + /* * subsys_initcall() for memory controller. 
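Note how the two uncharge entry points above share one implementation: the single-page variant threads the page onto its own empty lru list head, and uncharge_list() flushes one batch to the res_counter each time the memcg changes while walking the list. Caller-side, a bulk release looks roughly like this (the list name is illustrative; the actual conversion of release_pages() lives in a separate mm/swap.c change):

    LIST_HEAD(pages_to_free);

    /* ... pages collected here as their refcounts drop to zero ... */

    mem_cgroup_uncharge_list(&pages_to_free);       /* batched per memcg run */
    free_hot_cold_page_list(&pages_to_free, false);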
* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a013bc9..8639f6b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -148,7 +148,7 @@ static int hwpoison_filter_task(struct page *p) ino = cgroup_ino(css->cgroup); css_put(css); - if (!ino || ino != hwpoison_filter_memcg) + if (ino != hwpoison_filter_memcg) return -EINVAL; return 0; @@ -1173,6 +1173,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) lock_page(hpage); /* + * The page could have changed compound pages during the locking. + * If this happens just bail out. + */ + if (compound_head(p) != hpage) { + action_result(pfn, "different compound page after locking", IGNORED); + res = -EBUSY; + goto out; + } + + /* * We use page flags to determine what action should be taken, but * the flags can be modified by the error containment action. One * example is an mlocked page, where PG_mlocked is cleared by diff --git a/mm/memory.c b/mm/memory.c index 8b44f76..3e50383 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -118,6 +118,8 @@ __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; unsigned long highest_memmap_pfn __read_mostly; +EXPORT_SYMBOL(zero_pfn); + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -751,7 +753,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte) || pte_numa(pte))) + if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; @@ -777,15 +779,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } + if (is_zero_pfn(pfn)) + return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } - if (is_zero_pfn(pfn)) - return NULL; - /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. 
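The memory_failure() hunk above is a lock-then-revalidate fix: lock_page() can sleep, and a THP split or collapse in that window can leave hpage pointing at a page that is no longer p's compound head. The general shape of the pattern, as a sketch with a simplified error path:

    struct page *hpage = compound_head(p);

    lock_page(hpage);                       /* may sleep */
    if (compound_head(p) != hpage) {        /* relationship changed under us */
            unlock_page(hpage);
            return -EBUSY;
    }
    /* from here, p's compound linkage is stable under the page lock */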
@@ -884,7 +885,7 @@ out_set_pte: return 0; } -int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { @@ -1126,7 +1127,7 @@ again: addr) != page->index) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(ptent)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, addr, pte, ptfile); } if (PageAnon(page)) @@ -1146,6 +1147,7 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { force_flush = 1; + addr += PAGE_SIZE; break; } continue; @@ -1292,7 +1294,6 @@ static void unmap_page_range(struct mmu_gather *tlb, details = NULL; BUG_ON(addr >= end); - mem_cgroup_uncharge_start(); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { @@ -1302,7 +1303,6 @@ static void unmap_page_range(struct mmu_gather *tlb, next = zap_pud_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); - mem_cgroup_uncharge_end(); } @@ -2049,11 +2049,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *dirty_page = NULL; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ + struct mem_cgroup *memcg; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { /* - * VM_MIXEDMAP !pfn_valid() case + * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a + * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable as we can't do any dirty @@ -2204,7 +2206,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -2234,6 +2236,8 @@ gotten: */ ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -2271,7 +2275,7 @@ gotten: new_page = old_page; ret |= VM_FAULT_WRITE; } else - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); if (new_page) page_cache_release(new_page); @@ -2399,7 +2403,10 @@ EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_sem locked or unlocked in the same cases + * as does filemap_fault(). 
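The soft-dirty change in the zap_pte_range() hunk above (and the matching one in do_set_pte() further down) fixes the same class of bug: pte_t is passed and returned by value, so a pte helper whose return value is discarded modifies nothing. In short:

    pte_t ptfile = pgoff_to_pte(page->index);

    pte_file_mksoft_dirty(ptfile);          /* buggy: result thrown away */
    ptfile = pte_file_mksoft_dirty(ptfile); /* fixed: assign the new pte */
    set_pte_at(mm, addr, pte, ptfile);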
*/ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -2407,10 +2414,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, { spinlock_t *ptl; struct page *page, *swapcache; + struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; @@ -2486,7 +2493,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2511,10 +2518,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * while the page is counted on swap but not yet in mapcount i.e. * before page_add_anon_rmap() and swap_free(); try_to_free_swap() * must be called after the swap_free(), or it will never succeed. - * Because delete_from_swap_page() may be called by reuse_swap_page(), - * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry - * in page->private. In this case, a record in swap_cgroup is silently - * discarded at swap_free(). */ inc_mm_counter_fast(mm, MM_ANONPAGES); @@ -2530,12 +2533,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); - if (page == swapcache) + if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, address); - /* It's better to call commit-charge after rmap is established */ - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) @@ -2568,7 +2573,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge(page, memcg); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -2624,6 +2629,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags) { + struct mem_cgroup *memcg; struct page *page; spinlock_t *ptl; pte_t entry; @@ -2657,7 +2663,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2670,6 +2676,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -2679,7 +2687,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); goto unlock; oom_free_page: @@ -2688,6 +2696,11 @@ oom: return VM_FAULT_OOM; } +/* + * The mmap_sem must have been held on entry, and may have been + * released depending on flags and vma->vm_ops->fault() 
return value. + * See filemap_fault() and __lock_page_retry(). + */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, struct page **page) { @@ -2744,7 +2757,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - pte_mksoft_dirty(entry); + entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2758,17 +2771,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); - -static inline unsigned long fault_around_pages(void) -{ - return fault_around_bytes >> PAGE_SHIFT; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~(fault_around_bytes - 1) & PAGE_MASK; -} +static unsigned long fault_around_bytes __read_mostly = + rounddown_pow_of_two(65536); #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) @@ -2834,12 +2838,15 @@ late_initcall(fault_around_debugfs); static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) { - unsigned long start_addr; + unsigned long start_addr, nr_pages, mask; pgoff_t max_pgoff; struct vm_fault vmf; int off; - start_addr = max(address & fault_around_mask(), vma->vm_start); + nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; + mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; + + start_addr = max(address & mask, vma->vm_start); off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); pte -= off; pgoff -= off; @@ -2851,7 +2858,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + fault_around_pages() - 1); + pgoff + nr_pages - 1); /* Check if it makes any sense to call ->map_pages */ while (!pte_none(*pte)) { @@ -2886,7 +2893,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * something). 
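The do_fault_around() rework above is about reading a writable tunable exactly once: fault_around_bytes can be changed through debugfs at any moment, and deriving the page count and the address mask from two separate reads could yield an inconsistent pair. Both are now computed from a single snapshot:

    /* one racy read; everything derived from it stays self-consistent */
    nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
    mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

    start_addr = max(address & mask, vma->vm_start);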
*/ if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_pages() > 1) { + fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -2917,6 +2924,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page, *new_page; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret; @@ -2928,7 +2936,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { page_cache_release(new_page); return VM_FAULT_OOM; } @@ -2948,12 +2956,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); unlock_page(fault_page); page_cache_release(fault_page); return ret; uncharge_out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); page_cache_release(new_page); return ret; } @@ -3016,6 +3026,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults). + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3040,7 +3056,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -3172,7 +3190,10 @@ out: * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -3181,7 +3202,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte_t entry; spinlock_t *ptl; - entry = *pte; + entry = ACCESS_ONCE(*pte); if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { @@ -3232,6 +3253,9 @@ unlock: /* * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). 
*/ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -3313,6 +3337,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { @@ -3403,44 +3433,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -#if !defined(__HAVE_ARCH_GATE_AREA) - -#if defined(AT_SYSINFO_EHDR) -static struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; - - return 0; -} -__initcall(gate_vma_init); -#endif - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef AT_SYSINFO_EHDR - return &gate_vma; -#else - return NULL; -#endif -} - -int in_gate_area_no_mm(unsigned long addr) -{ -#ifdef AT_SYSINFO_EHDR - if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) - return 1; -#endif - return 0; -} - -#endif /* __HAVE_ARCH_GATE_AREA */ - static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { @@ -3591,11 +3583,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { +#ifndef CONFIG_HAVE_IOREMAP_PROT + break; +#else /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. 
*/
-#ifdef CONFIG_HAVE_IOREMAP_PROT
 			vma = find_vma(mm, addr);
 			if (!vma || vma->vm_start > addr)
 				break;
@@ -3603,9 +3597,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 				ret = vma->vm_ops->access(vma, addr, buf,
 							  len, write);
 			if (ret <= 0)
-#endif
 				break;
 			bytes = ret;
+#endif
 		} else {
 			bytes = len;
 			offset = addr & (PAGE_SIZE-1);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 469bbf5..1bf4807 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
 #include <linux/stop_machine.h>
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>
 
 #include <asm/tlbflush.h>
 
@@ -284,8 +285,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 }
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
-static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
-			   unsigned long end_pfn)
+static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
+				     unsigned long end_pfn)
 {
 	unsigned long old_zone_end_pfn;
 
@@ -427,8 +428,8 @@ out_fail:
 	return -1;
 }
 
-static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
-			    unsigned long end_pfn)
+static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
+				      unsigned long end_pfn)
 {
 	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
 
@@ -977,15 +978,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	zone = page_zone(pfn_to_page(pfn));
 	ret = -EINVAL;
-	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
+	if ((zone_idx(zone) > ZONE_NORMAL ||
+	    online_type == MMOP_ONLINE_MOVABLE) &&
 	    !can_online_high_movable(zone))
 		goto out;
 
-	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
+	if (online_type == MMOP_ONLINE_KERNEL &&
+	    zone_idx(zone) == ZONE_MOVABLE) {
 		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
 			goto out;
 	}
-	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
+	if (online_type == MMOP_ONLINE_MOVABLE &&
+	    zone_idx(zone) == ZONE_MOVABLE - 1) {
 		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
 			goto out;
 	}
@@ -1063,6 +1067,16 @@ out:
 }
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
+static void reset_node_present_pages(pg_data_t *pgdat)
+{
+	struct zone *z;
+
+	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+		z->present_pages = 0;
+
+	pgdat->node_present_pages = 0;
+}
+
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 {
@@ -1093,6 +1107,21 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	build_all_zonelists(pgdat, NULL);
 	mutex_unlock(&zonelists_mutex);
 
+	/*
+	 * zone->managed_pages is set to an approximate value in
+	 * free_area_init_core(), which will cause
+	 * /sys/devices/system/node/nodeX/meminfo to show wrong data.
+	 * So reset it to 0 before any memory is onlined.
+	 */
+	reset_node_managed_pages(pgdat);
+
+	/*
+	 * When memory is hot-added, all the memory is in offline state. So
+	 * clear all zones' present_pages because they will be updated in
+	 * online_pages() and offline_pages().
+	 */
+	reset_node_present_pages(pgdat);
+
 	return pgdat;
}
 
@@ -1156,6 +1185,34 @@ static int check_hotplug_memory_range(u64 start, u64 size)
 	return 0;
 }
 
+/*
+ * If the movable zone has already been set up, newly added memory should be checked.
+ * If its address is higher than the movable zone, it should be added as movable.
+ * Without this check, the movable zone may overlap with another zone.
+ */ +static int should_add_memory_movable(int nid, u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; + + if (zone_is_empty(movable_zone)) + return 0; + + if (movable_zone->zone_start_pfn <= start_pfn) + return 1; + + return 0; +} + +int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +{ + if (should_add_memory_movable(nid, start, size)) + return ZONE_MOVABLE; + + return zone_default; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -1276,7 +1333,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* * Confirm all pages in a range [start, end) is belongs to the same zone. */ -static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct zone *zone = NULL; @@ -1881,7 +1938,6 @@ void try_offline_node(int nid) unsigned long start_pfn = pgdat->node_start_pfn; unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; unsigned long pfn; - struct page *pgdat_page = virt_to_page(pgdat); int i; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { @@ -1910,10 +1966,6 @@ void try_offline_node(int nid) node_set_offline(nid); unregister_one_node(nid); - if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) - /* node data is allocated from boot memory */ - return; - /* free waittable in each zone */ for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8f5330d..e58725a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,25 +123,23 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; -static struct mempolicy *get_task_policy(struct task_struct *p) +struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; + int node; - if (!pol) { - int node = numa_node_id(); + if (pol) + return pol; - if (node != NUMA_NO_NODE) { - pol = &preferred_node_policy[node]; - /* - * preferred_node_policy is not initialised early in - * boot - */ - if (!pol->mode) - pol = NULL; - } + node = numa_node_id(); + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* preferred_node_policy is not initialised early in boot */ + if (pol->mode) + return pol; } - return pol; + return &default_policy; } static const struct mempolicy_operations { @@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, } if (flags & MPOL_MF_LAZY) { - change_prot_numa(vma, start, endvma); + /* Similar to task_numa_work, skip inaccessible VMAs */ + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + change_prot_numa(vma, start, endvma); goto next; } @@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; - struct mm_struct *mm = current->mm; NODEMASK_SCRATCH(scratch); int ret; @@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, ret = PTR_ERR(new); goto out; } - /* - * prevent changing our mempolicy while show_numa_maps() - * is using it. - * Note: do_set_mempolicy() can be called at init time - * with no 'mm'. 
- */ - if (mm) - down_write(&mm->mmap_sem); + task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); - if (mm) - up_write(&mm->mmap_sem); mpol_put(new); goto out; } @@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); task_unlock(current); - if (mm) - up_write(&mm->mmap_sem); - mpol_put(old); ret = 0; out: @@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, #endif -/* - * get_vma_policy(@task, @vma, @addr) - * @task: task for fallback if vma policy == default - * @vma: virtual memory area whose policy is sought - * @addr: address in @vma for shared policy lookup - * - * Returns effective policy for a VMA at specified address. - * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies must be - * protected by task_lock(task) by the caller. - * Shared policies [those marked as MPOL_F_SHARED] require an extra reference - * count--added by the get_policy() vm_op, as appropriate--to protect against - * freeing by another task. It is the caller's responsibility to free the - * extra reference for shared policies. - */ -struct mempolicy *get_vma_policy(struct task_struct *task, - struct vm_area_struct *vma, unsigned long addr) +struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) { - struct mempolicy *pol = get_task_policy(task); + struct mempolicy *pol = NULL; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { - struct mempolicy *vpol = vma->vm_ops->get_policy(vma, - addr); - if (vpol) - pol = vpol; + pol = vma->vm_ops->get_policy(vma, addr); } else if (vma->vm_policy) { pol = vma->vm_policy; @@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task, mpol_get(pol); } } + + return pol; +} + +/* + * get_vma_policy(@vma, @addr) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to current->mempolicy or system default policy, as necessary. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies. 
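The contract spelled out in the comment above boils down to: look the policy up, use it, then drop the extra reference only when the policy was shared. A hedged sketch of a caller honouring that contract (the node-selection step is a simplified example, not kernel code):

/* Sketch: resolve the effective policy for an address and release it. */
static int preferred_node_for(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = __get_vma_policy(vma, addr);
	int nid;

	if (!pol)
		pol = get_task_policy(current);	/* never NULL after this patch */

	nid = (pol->mode == MPOL_PREFERRED) ? pol->v.preferred_node :
					      numa_node_id();
	mpol_cond_put(pol);	/* drops the ref only for MPOL_F_SHARED */
	return nid;
}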
+ */ +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol = __get_vma_policy(vma, addr); + if (!pol) - pol = &default_policy; + pol = get_task_policy(current); + return pol; } -bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +bool vma_policy_mof(struct vm_area_struct *vma) { - struct mempolicy *pol = get_task_policy(task); - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - bool ret = false; + struct mempolicy *pol; - pol = vma->vm_ops->get_policy(vma, vma->vm_start); - if (pol && (pol->flags & MPOL_F_MOF)) - ret = true; - mpol_cond_put(pol); + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; - return ret; - } else if (vma->vm_policy) { - pol = vma->vm_policy; - } + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; } + pol = vma->vm_policy; if (!pol) - return default_policy.flags & MPOL_F_MOF; + pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } @@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, { struct zonelist *zl; - *mpol = get_vma_policy(current, vma, addr); + *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { @@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned int cpuset_mems_cookie; retry_cpuset: - pol = get_vma_policy(current, vma, addr); + pol = get_vma_policy(vma, addr); cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { @@ -2046,8 +2035,7 @@ retry_cpuset: page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol, node), policy_nodemask(gfp, pol)); - if (unlikely(mpol_needs_cond_ref(pol))) - __mpol_put(pol); + mpol_cond_put(pol); if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2074,12 +2062,12 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = get_task_policy(current); + struct mempolicy *pol = &default_policy; struct page *page; unsigned int cpuset_mems_cookie; - if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) - pol = &default_policy; + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long BUG_ON(!vma); - pol = get_vma_policy(current, vma, addr); + pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) goto out; diff --git a/mm/migrate.c b/mm/migrate.c index be6dbf9..0143995 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -146,8 +146,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (pte_swp_soft_dirty(*ptep)) pte = pte_mksoft_dirty(pte); + + /* Recheck VMA as permissions can change since migration started */ if (is_write_migration_entry(entry)) - pte = pte_mkwrite(pte); + pte = maybe_mkwrite(pte, vma); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -780,6 +783,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (rc != MIGRATEPAGE_SUCCESS) { newpage->mapping = NULL; } else { + mem_cgroup_migrate(page, newpage, false); if (remap_swapcache) remove_migration_ptes(page, newpage); 
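The remove_migration_pte() change above swaps an unconditional pte_mkwrite() for maybe_mkwrite(), so a mapping whose permissions were downgraded while migration was in flight no longer gets a writable PTE restored. For reference, maybe_mkwrite() is essentially the following (quoted from the mm headers of this era; treat it as a sketch rather than an authoritative definition):

/* Only make the PTE writable if the VMA still allows writes. */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte);
	return pte;
}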
page->mapping = NULL; @@ -795,7 +799,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, { int rc = -EAGAIN; int remap_swapcache = 1; - struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { @@ -821,9 +824,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, lock_page(page); } - /* charge against new page */ - mem_cgroup_prepare_migration(page, newpage, &mem); - if (PageWriteback(page)) { /* * Only in the case of a full synchronous migration is it */ if (mode != MIGRATE_SYNC) { rc = -EBUSY; - goto uncharge; + goto out_unlock; } if (!force) - goto uncharge; + goto out_unlock; wait_on_page_writeback(page); } /* @@ -872,11 +872,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ remap_swapcache = 0; } else { - goto uncharge; + goto out_unlock; } } - if (unlikely(balloon_page_movable(page))) { + if (unlikely(isolated_balloon_page(page))) { /* * A ballooned page does not need any special attention from * physical to virtual reverse mapping procedures. * the page migration right away (protected by page lock). */ rc = balloon_page_migrate(newpage, page, mode); - goto uncharge; + goto out_unlock; } /* @@ -904,7 +904,7 @@ VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto uncharge; + goto out_unlock; } goto skip_unmap; } @@ -923,10 +923,7 @@ skip_unmap: if (anon_vma) put_anon_vma(anon_vma); -uncharge: - mem_cgroup_end_migration(mem, page, newpage, - (rc == MIGRATEPAGE_SUCCESS || - rc == MIGRATEPAGE_BALLOON_SUCCESS)); +out_unlock: unlock_page(page); out: return rc; @@ -958,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, rc = __unmap_and_move(page, newpage, force, mode); - if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { - /* - * A ballooned page has been migrated already. - * Now, it's the time to wrap-up counters, - * handle the page back to Buddy and return. - */ - dec_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - balloon_page_free(page); - return MIGRATEPAGE_SUCCESS; - } out: if (rc != -EAGAIN) { /* @@ -991,6 +977,9 @@ out: if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { ClearPageSwapBacked(newpage); put_new_page(newpage, private); + } else if (unlikely(__is_movable_balloon_page(newpage))) { /* drop our reference, page already in the balloon */ + put_page(newpage); + } else putback_lru_page(newpage); @@ -1786,7 +1775,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; @@ -1852,15 +1840,6 @@ fail_putback: goto out_unlock; } - /* - * Traditional migration needs to prepare the memcg charge - * transaction early to prevent the old page from being - * uncharged when installing migration entries. Here we can - * save the potential rollback and start the charge transfer - * only when migration is already known to end successfully.
- */ - mem_cgroup_prepare_migration(page, new_page, &memcg); - orig_entry = *pmd; entry = mk_pmd(new_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); @@ -1888,14 +1867,10 @@ fail_putback: goto fail_putback; } + mem_cgroup_migrate(page, new_page, false); + page_remove_rmap(page); - /* - * Finish the charge transaction under the page table lock to - * prevent split_huge_page() from dividing up the charge - * before it's fully transferred to the new page. - */ - mem_cgroup_end_migration(memcg, page, new_page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -210,12 +210,19 @@ out: * @vma: target vma * @start: start address * @end: end address + * @nonblocking: * * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * - * vma->vm_mm->mmap_sem must be held for at least read. + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must be held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. */ long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) @@ -226,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON(start < vma->vm_start); - VM_BUG_ON(end > vma->vm_end); - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* @@ -782,7 +789,7 @@ static int do_mlockall(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - cond_resched(); + cond_resched_rcu_qs(); } out: return 0; @@ -31,6 +31,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> +#include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> @@ -69,7 +70,7 @@ static void unmap_region(struct mm_struct *mm, * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes @@ -88,6 +89,25 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) } EXPORT_SYMBOL(vm_get_page_prot); +static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +{ + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); +} + +/* Update vma->vm_page_prot to reflect vma->vm_flags.
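The @nonblocking rules documented above push a retry obligation onto callers: if the callee dropped mmap_sem, *@nonblocking is cleared and the caller must not unlock again. A condensed sketch of that pattern, loosely modelled on __mm_populate() (the wrapper name and the minimal error handling are assumptions):

/* Sketch: fault in one VMA range, honouring the @nonblocking contract. */
static long populate_vma_range(struct vm_area_struct *vma,
			       unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = __mlock_vma_pages_range(vma, start, end, &locked);
	if (locked)		/* callee may have released mmap_sem */
		up_read(&mm->mmap_sem);
	return ret;
}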
*/ +void vma_set_page_prot(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma)) { + vm_flags &= ~VM_SHARED; + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, + vm_flags); + } +} + + int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ unsigned long sysctl_overcommit_kbytes __read_mostly; @@ -134,6 +154,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed, reserve; + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + vm_acct_memory(pages); /* @@ -216,7 +240,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; + mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -263,7 +287,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len); SYSCALL_DEFINE1(brk, unsigned long, brk) { - unsigned long rlim, retval; + unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; unsigned long min_brk; @@ -293,9 +317,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * segment grow beyond its set limit in the case where the limit is * not page aligned -Ram Gupta */ - rlim = rlimit(RLIMIT_DATA); - if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, + mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); @@ -364,20 +387,22 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_emerg("vm_start %lx < prev %lx\n", + vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_emerg("vm_start %lx < pend %lx\n", + vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - pr_info("vm_end %lx < vm_start %lx\n", - vma->vm_end, vma->vm_start); + pr_emerg("vm_start %lx > vm_end %lx\n", + vma->vm_start, vma->vm_end); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_info("free gap %lx, correct %lx\n", + pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -391,7 +416,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - pr_info("backwards %d, forwards %d\n", j, i); + pr_emerg("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ?
-1 : i; @@ -404,8 +429,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); - BUG_ON(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); + VM_BUG_ON_VMA(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma), + vma); } } @@ -415,8 +441,10 @@ static void validate_mm(struct mm_struct *mm) int i = 0; unsigned long highest_address = 0; struct vm_area_struct *vma = mm->mmap; + while (vma) { struct anon_vma_chain *avc; + vma_lock_anon_vma(vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); @@ -426,20 +454,21 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_info("map_count %d vm_next %d\n", mm->map_count, i); + pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - pr_info("mm->highest_vm_end %lx, found %lx\n", - mm->highest_vm_end, highest_address); + pr_emerg("mm->highest_vm_end %lx, found %lx\n", + mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - pr_info("map_count %d rb %d\n", mm->map_count, i); + if (i != -1) + pr_emerg("map_count %d rb %d\n", mm->map_count, i); bug = 1; } - BUG_ON(bug); + VM_BUG_ON_MM(bug, mm); } #else #define validate_mm_rb(root, ignore) do { } while (0) @@ -617,7 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -736,7 +765,7 @@ again: remove_next = 1 + (end > next->vm_end); * split_vma inserting another: so it must be * mprotect case 4 shifting the boundary down. */ - adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); + adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); exporter = vma; importer = next; } @@ -782,8 +811,8 @@ again: remove_next = 1 + (end > next->vm_end); if (!anon_vma && adjust_next) anon_vma = next->anon_vma; if (anon_vma) { - VM_BUG_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); + VM_BUG_ON_VMA(adjust_next && next->anon_vma && + anon_vma != next->anon_vma, next); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) @@ -1005,7 +1034,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, + struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; @@ -1031,7 +1060,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, * Can it merge with the predecessor? 
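validate_mm()/validate_mm_rb() above now fail through VM_BUG_ON_VMA() and VM_BUG_ON_MM(), which dump the offending object before BUGging instead of printing a bare condition. Their shape is roughly the following sketch of the mmdebug.h additions that accompany this series (dump_vma()/dump_mm() live in the new mm/debug.c; exact definitions may differ):

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON_VMA(cond, vma)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_vma(vma);	/* dump the vma, then die */	\
			BUG();						\
		}							\
	} while (0)
#define VM_BUG_ON_MM(cond, mm)						\
	do {								\
		if (unlikely(cond)) {					\
			dump_mm(mm);	/* dump the mm_struct, then die */ \
			BUG();						\
		}							\
	} while (0)
#else
#define VM_BUG_ON_VMA(cond, vma)	BUILD_BUG_ON_INVALID(cond)
#define VM_BUG_ON_MM(cond, mm)		BUILD_BUG_ON_INVALID(cond)
#endif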
*/ if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && + mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff)) { /* @@ -1051,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(prev); + khugepaged_enter_vma_merge(prev, vm_flags); return prev; } @@ -1059,7 +1088,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, * Can this new request be merged in front of next? */ if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && + mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen)) { if (prev && addr < prev->vm_end) /* case 4 */ @@ -1070,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(area); + khugepaged_enter_vma_merge(area, vm_flags); return area; } @@ -1230,7 +1259,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flags, unsigned long pgoff, unsigned long *populate) { - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; vm_flags_t vm_flags; *populate = 0; @@ -1258,7 +1287,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) - return -EOVERFLOW; + return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) @@ -1465,11 +1494,16 @@ int vma_wants_writenotify(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->page_mkwrite) return 1; - /* The open routine did something to the protections already? */ + /* The open routine did something to the protections that pgprot_modify + * won't preserve? */ if (pgprot_val(vma->vm_page_prot) != - pgprot_val(vm_get_page_prot(vm_flags))) + pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) return 0; + /* Do we need to track softdirty? */ + if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + return 1; + /* Specialty mapping? */ if (vm_flags & VM_PFNMAP) return 0; @@ -1572,6 +1606,17 @@ munmap_back: if (error) goto free_vma; } + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto allow_write_and_free_vma; + } + + /* ->mmap() can change vma->vm_file, but must guarantee that + * vma_link() below can deny write-access if VM_DENYWRITE is set + * and map writably if VM_SHARED is set. This usually means the + * new file must not have been exposed to user-space, yet. + */ vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) @@ -1594,25 +1639,14 @@ munmap_back: goto free_vma; } - if (vma_wants_writenotify(vma)) { - pgprot_t pprot = vma->vm_page_prot; - - /* Can vma->vm_page_prot have changed?? - * - * Answer: Yes, drivers may have changed it in their - * f_op->mmap method. - * - * Ensures that vmas marked as uncached stay that way. 
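mmap_region() above now takes a "map writable" reference before calling ->mmap() and drops it again on the unwind paths, pairing with the conversion of i_mmap_writable to an atomic_t seen elsewhere in this diff. The helpers are presumably along these lines (a sketch of the fs-header side; the negative-counter convention is how write sealing blocks new writable mappings):

/* Sketch: refcount writable mappings; a negative count means sealed. */
static inline int mapping_map_writable(struct address_space *mapping)
{
	return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
		0 : -EPERM;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
	atomic_dec(&mapping->i_mmap_writable);
}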
- */ - vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - } - vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); + if (file) { + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + } file = vma->vm_file; out: perf_event_mmap(vma); @@ -1638,17 +1672,22 @@ out: */ vma->vm_flags |= VM_SOFTDIRTY; + vma_set_page_prot(vma); + return addr; unmap_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); charged = 0; + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +allow_write_and_free_vma: + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: @@ -1898,7 +1937,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_mask = 0; return vm_unmapped_area(&info); } -#endif +#endif /* * This mmap-allocator allocates new areas top-down from below the @@ -2169,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } @@ -2238,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } @@ -2298,13 +2337,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) } struct vm_area_struct * -find_extend_vma(struct mm_struct * mm, unsigned long addr) +find_extend_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct * vma; + struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; - vma = find_vma(mm,addr); + vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) @@ -2353,7 +2392,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { - struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; + struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; struct mmu_gather tlb; lru_add_drain(); @@ -2400,7 +2439,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, * __split_vma() bypasses sysctl_max_map_count checking. We use this on the * munmap path where it doesn't make sense to fail. */ -static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, +static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; @@ -2489,7 +2528,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; - if ((len = PAGE_ALIGN(len)) == 0) + len = PAGE_ALIGN(len); + if (len == 0) return -EINVAL; /* Find the first overlapping VMA */ @@ -2535,7 +2575,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (error) return error; } - vma = prev? prev->vm_next: mm->mmap; + vma = prev ? 
prev->vm_next : mm->mmap; /* * unlock any mlock()ed ranges before detaching vmas @@ -2598,10 +2638,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) */ static unsigned long do_brk(unsigned long addr, unsigned long len) { - struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; unsigned long flags; - struct rb_node ** rb_link, * rb_parent; + struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2825,7 +2865,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. */ - VM_BUG_ON(faulted_in_anon_vma); + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); @@ -3173,7 +3213,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 41cefdf..2c8da98 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -23,6 +23,25 @@ static struct srcu_struct srcu; /* + * This function allows mmu_notifier::release callback to delay a call to + * a function that will free appropriate resources. The function must be + * quick and must not block. + */ +void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)) +{ + call_srcu(&srcu, rcu, func); +} +EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); + +void mmu_notifier_synchronize(void) +{ + /* Wait for any running method to finish. */ + srcu_barrier(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + +/* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap * runs with mm_users == 0. Other tasks may still invoke mmu notifiers @@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm) hlist_del_init_rcu(&mn->hlist); } spin_unlock(&mm->mmu_notifier_mm->lock); + srcu_read_unlock(&srcu, id); /* * synchronize_srcu here prevents mmu_notifier_release from returning to @@ -88,7 +107,8 @@ void __mmu_notifier_release(struct mm_struct *mm) * existed or not. */ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct mmu_notifier *mn; int young = 0, id; @@ -96,7 +116,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) - young |= mn->ops->clear_flush_young(mn, mm, address); + young |= mn->ops->clear_flush_young(mn, mm, start, end); } srcu_read_unlock(&srcu, id); @@ -325,6 +345,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); +/* + * Same as mmu_notifier_unregister but no callback and no srcu synchronization. 
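mmu_notifier_call_srcu() above gives ->release() implementations, which must be quick and must not block, a way to defer freeing until the SRCU grace period has passed. A hedged sketch of a driver-side user (the notifier wrapper and free routine are hypothetical):

/* Hypothetical notifier that frees itself via the shared SRCU domain. */
struct example_notifier {
	struct mmu_notifier mn;
	struct rcu_head rcu;
};

static void example_notifier_free(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct example_notifier, rcu));
}

static void example_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct example_notifier *en =
		container_of(mn, struct example_notifier, mn);

	/* Quick and non-blocking: just queue the free. */
	mmu_notifier_call_srcu(&en->rcu, example_notifier_free);
}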
+ */ +void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. + */ + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + static int __init mmu_notifier_init(void) { return init_srcu_struct(&srcu); diff --git a/mm/mprotect.c b/mm/mprotect.c index c43d557..ace9345 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,13 +29,6 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> -#ifndef pgprot_modify -static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) -{ - return newprot; -} -#endif - /* * For a prot_numa update we only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -93,7 +86,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * Avoid taking write faults for pages we * know to be dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; @@ -320,13 +315,8 @@ success: * held in write mode. */ vma->vm_flags = newflags; - vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, - vm_get_page_prot(newflags)); - - if (vma_wants_writenotify(vma)) { - vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED); - dirty_accountable = 1; - } + dirty_accountable = vma_wants_writenotify(vma); + vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); diff --git a/mm/mremap.c b/mm/mremap.c index 05f1180..b147f66 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -21,8 +21,8 @@ #include <linux/syscalls.h> #include <linux/mmu_notifier.h> #include <linux/sched/sysctl.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_huge(*old_pmd)) { int err = 0; if (extent == HPAGE_PMD_SIZE) { - VM_BUG_ON(vma->vm_file || !vma->anon_vma); + VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, + vma); /* See comment in move_ptes() */ if (need_rmap_locks) anon_vma_lock_write(vma->anon_vma); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7ed5860..90b5046 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end; u64 i; + memblock_clear_hotplug(0, -1); + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); @@ -143,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -157,8 +157,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } @@ -539,7 +539,7 @@ void __init 
mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } @@ -1981,11 +1981,6 @@ error: return -ENOMEM; } -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} - int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3291e82..5340f6b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { - if (task->exit_state) - return OOM_SCAN_CONTINUE; if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; @@ -406,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -559,28 +574,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; - int ret = 1; + bool ret = true; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - if (zone_is_oom_locked(zone)) { - ret = 0; + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) + if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { + ret = false; goto out; } - } - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - /* - * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zonelist_oom() doesn't succeed - * when it shouldn't. - */ - zone_set_flag(zone, ZONE_OOM_LOCKED); - } + /* + * Lock each zone in the zonelist under zone_scan_lock so a parallel + * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. + */ + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) + set_bit(ZONE_OOM_LOCKED, &zone->flags); out: spin_unlock(&zone_scan_lock); @@ -592,15 +604,14 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. 
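The renamed oom_zonelist_trylock()/oom_zonelist_unlock() pair keeps the old usage shape, as the pagefault_out_of_memory() hunk further down shows. Condensed, a caller is expected to look like this (sketch; the zonelist and gfp choice are illustrative):

/* Sketch: serialise OOM killing across tasks targeting the same zones. */
static void oom_kill_on(struct zonelist *zonelist)
{
	if (!oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
		/* another task is already OOM killing into these zones */
		schedule_timeout_uninterruptible(1);
		return;
	}
	out_of_memory(zonelist, GFP_KERNEL, 0, NULL, false);
	oom_zonelist_unlock(zonelist, GFP_KERNEL);
}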
*/ -void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - zone_clear_flag(zone, ZONE_OOM_LOCKED); - } + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) + clear_bit(ZONE_OOM_LOCKED, &zone->flags); spin_unlock(&zone_scan_lock); } @@ -694,9 +705,9 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; - zonelist = node_zonelist(first_online_node, GFP_KERNEL); - if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { + zonelist = node_zonelist(first_memory_node, GFP_KERNEL); + if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_zonelist_oom(zonelist, GFP_KERNEL); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e0c9430..19ceae8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -261,14 +261,11 @@ static unsigned long global_dirtyable_memory(void) */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { + const unsigned long available_memory = global_dirtyable_memory(); unsigned long background; unsigned long dirty; - unsigned long uninitialized_var(available_memory); struct task_struct *tsk; - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = global_dirtyable_memory(); - if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else @@ -1078,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, } if (dirty < setpoint) { - x = min(bdi->balanced_dirty_ratelimit, - min(balanced_dirty_ratelimit, task_ratelimit)); + x = min3(bdi->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { - x = max(bdi->balanced_dirty_ratelimit, - max(balanced_dirty_ratelimit, task_ratelimit)); + x = max3(bdi->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } @@ -1780,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** @@ -2119,23 +2116,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) EXPORT_SYMBOL(account_page_dirtied); /* - * Helper function for set_page_writeback family. - * - * The caller must hold mem_cgroup_begin/end_update_page_stat() lock - * while calling this function. - * See test_set_page_writeback for example. - * - * NOTE: Unlike account_page_dirtied this does not rely on being atomic - * wrt interrupts. - */ -void account_page_writeback(struct page *page) -{ - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); -} -EXPORT_SYMBOL(account_page_writeback); - -/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. 
* @@ -2347,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2372,22 +2353,23 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2413,9 +2395,11 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } else { ret = TestSetPageWriteback(page); } - if (!ret) - account_page_writeback(page); - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + if (!ret) { + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + inc_zone_page_state(page, NR_WRITEBACK); + } + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef44ad7..616a2c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,8 +53,6 @@ #include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> -#include <linux/ftrace_event.h> -#include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> @@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node); */ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); +int _node_numa_mem_[MAX_NUMNODES]; #endif /* @@ -468,29 +467,6 @@ static inline void rmv_page_order(struct page *page) } /* - * Locate the struct page for both the matching buddy in our - * pair (buddy1) and the combined O(n+1) page they form (page). 
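The writeback accounting above moves from mem_cgroup_begin/end_update_page_stat() to mem_cgroup_begin_page_stat(), which hands back the memcg so the stat updates no longer re-derive it from the page. Abstracted, the new pattern looks like this (sketch; the helper and the WRITEBACK index are just an example):

/* Sketch of the begin/stat/end pattern used by the hunks above. */
static void dec_page_writeback_stat(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;
	bool locked;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
	dec_zone_page_state(page, NR_WRITEBACK);
	mem_cgroup_end_page_stat(memcg, locked, flags);
}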
- * - * 1) Any buddy B1 will have an order O twin B2 which satisfies - * the following equation: - * B2 = B1 ^ (1 << O) - * For example, if the starting buddy (buddy2) is #8 its order - * 1 buddy is #10: - * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 - * - * 2) Any buddy B will have an order O+1 parent P which - * satisfies the following equation: - * P = B & ~(1 << O) - * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER - */ -static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) -{ - return page_idx ^ (1 << order); -} - -/* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if * (a) the buddy is not in a hole && @@ -570,6 +546,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; + int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); @@ -578,13 +555,24 @@ static inline void __free_one_page(struct page *page, return; VM_BUG_ON(migratetype == -1); + if (is_migrate_isolate(migratetype)) { + /* + * We restrict max order of merging to prevent merge + * between freepages on isolate pageblock and normal + * pageblock. Without this, pageblock isolation + * could cause incorrect freepage accounting. + */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { + __mod_zone_freepage_state(zone, 1 << order, migratetype); + } - page_idx = pfn & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << max_order) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER-1) { + while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) @@ -595,9 +583,11 @@ static inline void __free_one_page(struct page *page, */ if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); - set_page_private(page, 0); - __mod_zone_freepage_state(zone, 1 << order, - migratetype); + set_page_private(buddy, 0); + if (!is_migrate_isolate(migratetype)) { + __mod_zone_freepage_state(zone, 1 << order, + migratetype); + } } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -680,9 +670,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -713,14 +706,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); + if (unlikely(has_isolate_pageblock(zone))) + mt = get_pageblock_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(!is_migrate_isolate_page(page))) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } } while (--to_free && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); @@ -731,12 +722,17 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned =
zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } __free_one_page(page, pfn, zone, order, migratetype); - if (unlikely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -1008,7 +1004,7 @@ int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif for (page = start_page; page <= end_page;) { @@ -1257,15 +1253,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; @@ -1483,7 +1475,7 @@ void split_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(split_page); -static int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order) { unsigned long watermark; struct zone *zone; @@ -1610,6 +1602,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && + !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) + set_bit(ZONE_FAIR_DEPLETED, &zone->flags); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1712,7 +1707,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, { /* free_pages may go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1727,7 +1721,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1922,6 +1916,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1939,8 +1945,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free.
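reset_alloc_batches() above re-arms NR_ALLOC_BATCH and clears ZONE_FAIR_DEPLETED for every zone on the preferred zone's node, while the fast path now sets that bit as soon as a zone's batch goes non-positive. The fairness filter in the zonelist walk therefore reduces to roughly this predicate (a condensed sketch, not literal kernel code):

/* Condensed: may this zone satisfy an ALLOC_FAIR pass? */
static bool fair_pass_allows(struct zone *zone, struct zone *preferred_zone)
{
	/* round-robin fairness only applies to the local node */
	if (!zone_local(preferred_zone, zone))
		return false;
	/* batch exhausted: skip it and bump nr_fair_skipped */
	if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
		return false;
	return true;
}

When every local zone was skipped this way, the batches are reset and the scan restarts without ALLOC_FAIR, preferring spill to a remote zone over waking kswapd.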
* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1964,9 +1974,11 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) + break; + if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { + nr_fair_skipped++; continue; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + } } /* * When allocating a page cache page for writing, we @@ -2072,13 +2084,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2087,8 +2093,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2201,13 +2236,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. 
@@ -2240,7 +2283,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } @@ -2251,58 +2294,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int classzone_idx, int migratetype, enum migrate_mode mode, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int *contended_compaction, bool *deferred_compaction) { - if (!order) - return NULL; + struct zone *last_compact_zone = NULL; + unsigned long compact_result; + struct page *page; - if (compaction_deferred(preferred_zone, order)) { - *deferred_compaction = true; + if (!order) return NULL; - } current->flags |= PF_MEMALLOC; - *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, - contended_compaction); + contended_compaction, + &last_compact_zone); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { - struct page *page; + switch (compact_result) { + case COMPACT_DEFERRED: + *deferred_compaction = true; + /* fall-through */ + case COMPACT_SKIPPED: + return NULL; + default: + break; + } - /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); + /* + * At least in one zone compaction wasn't deferred or skipped, so let's + * count a compaction stall + */ + count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, nodemask, - order, zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); - if (page) { - preferred_zone->compact_blockskip_flush = false; - compaction_defer_reset(preferred_zone, order, true); - count_vm_event(COMPACTSUCCESS); - return page; - } + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); - /* - * It's bad if compaction run occurs and fails. - * The most likely reason is that pages exist, - * but not enough to satisfy watermarks. - */ - count_vm_event(COMPACTFAIL); + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, classzone_idx, migratetype); - /* - * As async compaction considers a subset of pageblocks, only - * defer if the failure was a sync compaction failure. - */ - if (mode != MIGRATE_ASYNC) - defer_compaction(preferred_zone, order); + if (page) { + struct zone *zone = page_zone(page); - cond_resched(); + zone->compact_blockskip_flush = false; + compaction_defer_reset(zone, order, true); + count_vm_event(COMPACTSUCCESS); + return page; } + /* + * last_compact_zone is where try_to_compact_pages thought allocation + * should succeed, so it did not defer compaction. But here we know + * that it didn't succeed, so we do the defer. + */ + if (last_compact_zone && mode != MIGRATE_ASYNC) + defer_compaction(last_compact_zone, order); + + /* + * It's bad if compaction run occurs and fails. The most likely reason + * is that pages exist, but not enough to satisfy watermarks. 
+ */ + count_vm_event(COMPACTFAIL); + + cond_resched(); + return NULL; } #else @@ -2310,9 +2367,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, - enum migrate_mode mode, bool *contended_compaction, - bool *deferred_compaction, unsigned long *did_some_progress) + int classzone_idx, int migratetype, enum migrate_mode mode, + int *contended_compaction, bool *deferred_compaction) { return NULL; } @@ -2409,37 +2465,17 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - struct zone *preferred_zone) + struct zone *preferred_zone, + nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) wakeup_kswapd(zone, order, zone_idx(preferred_zone)); } @@ -2486,7 +2522,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_NO_WATERMARKS; } #ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -2510,7 +2546,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; - bool contended_compaction = false; + int contended_compaction = COMPACT_CONTENDED_NONE; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2537,7 +2573,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); + wake_all_kswapds(order, zonelist, high_zoneidx, + preferred_zone, nodemask); /* * OK, we're below the kswapd watermark and have kicked background @@ -2610,29 +2647,50 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; + /* Checks for THP-specific high-order allocations */ + if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. 
+ */ + if (deferred_compaction) + goto nopage; + + /* + * In all zones where compaction was attempted (and not + * deferred or skipped), lock contention has been detected. + * For THP allocation we do not want to disrupt the others + * so we fallback to base pages instead. + */ + if (contended_compaction == COMPACT_CONTENDED_LOCK) + goto nopage; + + /* + * If compaction was aborted due to need_resched(), we do not + * want to further increase allocation latency, unless it is + * khugepaged trying to collapse. + */ + if (contended_compaction == COMPACT_CONTENDED_SCHED + && !(current->flags & PF_KTHREAD)) + goto nopage; + } + /* * It can become very expensive to allocate transparent hugepages at * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; - /* - * If compaction is deferred for high-order allocations, it is because - * sync compaction recently failed. In this is the case and the caller - * requested a movable allocation that does not heavily disrupt the - * system then fail the allocation instead of entering direct reclaim. - */ - if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) - goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2702,8 +2760,7 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; } @@ -2729,7 +2786,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zone *preferred_zone; struct zoneref *preferred_zoneref; struct page *page = NULL; - int migratetype = allocflags_to_migratetype(gfp_mask); + int migratetype = gfpflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; int classzone_idx; @@ -2751,6 +2808,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; + retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2762,33 +2822,12 @@ retry_cpuset: goto out; classzone_idx = zonelist_zone_idx(preferred_zoneref); -#ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } - /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. 
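The THP checks above test "(gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE" rather than a single bit because GFP_TRANSHUGE is a compound mask: equality holds only when every constituent flag is present, and callers such as khugepaged clear some of them. A stand-alone illustration of the idiom (userspace C with made-up flag values, purely illustrative):

#include <assert.h>

#define F_WAIT	0x1u
#define F_IO	0x2u
#define F_COMP	0x4u			/* stand-ins, not real GFP bits */
#define F_THP	(F_WAIT | F_IO | F_COMP)

int main(void)
{
	unsigned int mask = F_WAIT | F_IO;	/* one constituent missing */

	assert((mask & F_THP) != F_THP);	/* partial match: not THP */
	mask |= F_COMP;
	assert((mask & F_THP) == F_THP);	/* all bits present: THP */
	return 0;
}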
@@ -2962,7 +3001,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); @@ -2970,7 +3009,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() @@ -3052,7 +3090,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3072,6 +3110,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; @@ -3253,12 +3292,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -3572,68 +3611,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) zonelist->_zonerefs[pos].zone_idx = 0; } +#if defined(CONFIG_64BIT) +/* + * Devices that require DMA32/DMA are relatively rare and do not justify a + * penalty to every machine in case the specialised case applies. Default + * to Node-ordering on 64-bit NUMA machines. + */ +static int default_zonelist_order(void) +{ + return ZONELIST_ORDER_NODE; +} +#else +/* + * On 32-bit, the Normal zone needs to be preserved for allocations accessible + * by the kernel. If processes running on node 0 deplete the low memory zone + * then reclaim will occur more frequently, increasing stalls, and the system + * is potentially easier to OOM if a large percentage of the zone is under + * writeback or dirty. The problem is significantly worse if CONFIG_HIGHPTE + * is not set. Hence, default to zone ordering on 32-bit. + */ static int default_zonelist_order(void) { - int nid, zone_type; - unsigned long low_kmem_size, total_size; - struct zone *z; - int average_size; - /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the system. - * If they are really small and used heavily, the system can fall - * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and configures zone order. - */ - /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..)
*/ - low_kmem_size = 0; - total_size = 0; - for_each_online_node(nid) { - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->managed_pages; - total_size += z->managed_pages; - } else if (zone_type == ZONE_NORMAL) { - /* - * If any node has only lowmem, then node order - * is preferred to allow kernel allocations - * locally; otherwise, they can easily infringe - * on other nodes when there is an abundance of - * lowmem available to allocate from. - */ - return ZONELIST_ORDER_NODE; - } - } - } - if (!low_kmem_size || /* there are no DMA area. */ - low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ - return ZONELIST_ORDER_NODE; - /* - * look into each node's config. - * If there is a node whose DMA/DMA32 memory is very big area on - * local memory, NODE_ORDER may be suitable. - */ - average_size = total_size / - (nodes_weight(node_states[N_MEMORY]) + 1); - for_each_online_node(nid) { - low_kmem_size = 0; - total_size = 0; - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; - } - } - if (low_kmem_size && - total_size > average_size && /* ignore small node */ - low_kmem_size > total_size * 70/100) - return ZONELIST_ORDER_NODE; - } return ZONELIST_ORDER_ZONE; } +#endif /* CONFIG_64BIT */ static void set_zonelist_order(void) { @@ -4969,6 +4970,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, + (u64) start_pfn << PAGE_SHIFT, ((u64) end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5579,7 +5582,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -5694,9 +5697,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); @@ -6271,8 +6273,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc->zone, cc, - pfn, end, true); + pfn = isolate_migratepages_range(cc, pfn, end); if (!pfn) { ret = -EINTR; break; } @@ -6398,13 +6399,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", - outer_start, end); + pr_info("%s: [%lx, %lx) PFNs busy\n", + __func__, outer_start, end); ret = -EBUSY; goto done; } - /* Grab isolated pages from freelists.
*/ outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { @@ -6548,97 +6548,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } #endif - -static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED - {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - {1UL << PG_compound_lock, "compound_lock" }, -#endif -}; - -static void dump_page_flags(unsigned long flags) -{ - const char *delim = ""; - unsigned long mask; - int i; - - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - - printk(KERN_ALERT "page flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { - - mask = pageflag_names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - printk("%s%s", delim, pageflag_names[i].name); - delim = "|"; - } - - /* check for left over flags */ - if (flags) - printk("%s%#lx", delim, flags); - - printk(")\n"); -} - -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) -{ - printk(KERN_ALERT - "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, atomic_read(&page->_count), page_mapcount(page), - page->mapping, page->index); - dump_page_flags(page->flags); - if (reason) - pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_page_flags(page->flags & badflags); - } - mem_cgroup_print_bad_page(page); -} - -void dump_page(struct page *page, const char *reason) -{ - dump_page_badflags(page, reason, 0); -} -EXPORT_SYMBOL(dump_page); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3708264..5331c2b 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -171,6 +171,7 @@ static void free_page_cgroup(void *addr) sizeof(struct page_cgroup) * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index d1473b2..c8778f7 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -60,6 +60,7 @@ out: int migratetype = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -75,16 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; + struct page 
*isolated_page = NULL; + unsigned int order; + unsigned long page_idx, buddy_idx; + struct page *buddy; zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_freepage_state(zone, nr_pages, migratetype); + + /* + * Because a free page of more than pageblock_order on an isolated + * pageblock is restricted from merging due to the freepage counting + * problem, it is possible that a free buddy page exists. + * move_freepages_block() does not handle merging, so we need another + * approach: isolating and freeing the page causes these pages to be + * merged. + */ + if (PageBuddy(page)) { + order = page_order(page); + if (order >= pageblock_order) { + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + + if (!is_migrate_isolate_page(buddy)) { + __isolate_free_page(page, order); + set_page_refcounted(page); + isolated_page = page; + } + } + } + + /* + * If we isolated a free page of more than pageblock_order, there + * should be no other free page in the range, so we can avoid the + * costly pageblock scan for free page moving. + */ + if (!isolated_page) { + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } set_pageblock_migratetype(page, migratetype); + zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); + if (isolated_page) + __free_pages(isolated_page, order); } static inline struct page * diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2beeabf..ad83195 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); pgd = pgd_offset(walk->mm, addr); do { diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fe..10e3d0b 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -33,17 +33,14 @@ #include <linux/log2.h> -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - return 0; } -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { /* nada */ } @@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, 0, nr_pages); + spin_unlock_irq(&pcpu_lock); + return chunk; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 3707c71..538998a 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, } /** - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @bitmapp: output parameter for bitmap - * @may_alloc: may allocate the array * - * Returns pointer to array of pointers to struct page and bitmap, - * both of which can be indexed with pcpu_page_idx.
The returned - * array is cleared to zero and *@bitmapp is copied from - * @chunk->populated. Note that there is only one array and bitmap - * and access exclusion is the caller's responsibility. - * - * CONTEXT: - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. - * Otherwise, don't care. + * Returns pointer to array of pointers to struct page which can be indexed + * with pcpu_page_idx(). Note that there is only one array and accesses + * should be serialized by pcpu_alloc_mutex. * * RETURNS: - * Pointer to temp pages array on success, NULL on failure. + * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, - unsigned long **bitmapp, - bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) { static struct page **pages; - static unsigned long *bitmap; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * - sizeof(unsigned long); - - if (!pages || !bitmap) { - if (may_alloc && !pages) - pages = pcpu_mem_zalloc(pages_size); - if (may_alloc && !bitmap) - bitmap = pcpu_mem_zalloc(bitmap_size); - if (!pages || !bitmap) - return NULL; - } - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + lockdep_assert_held(&pcpu_alloc_mutex); - *bitmapp = bitmap; + if (!pages) + pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * @@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @@ -104,11 +80,10 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * content of @pages and will pass it verbatim to pcpu_map_pages(). 
*/ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - unsigned int cpu; + unsigned int cpu, tcpu; int i; for_each_possible_cpu(cpu) { @@ -116,14 +91,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); - if (!*pagep) { - pcpu_free_pages(chunk, pages, populated, - page_start, page_end); - return -ENOMEM; - } + if (!*pagep) + goto err; } } return 0; + +err: + while (--i >= page_start) + __free_page(pages[pcpu_page_idx(cpu, i)]); + + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + for (i = page_start; i < page_end; i++) + __free_page(pages[pcpu_page_idx(tcpu, i)]); + } + return -ENOMEM; } /** @@ -155,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free - * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * @@ -166,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -183,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } - - bitmap_clear(populated, page_start, page_end - page_start); } /** @@ -219,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped - * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * @@ -227,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * - * This function is responsible for setting corresponding bits in - * @chunk->populated bitmap and whatever is necessary for reverse - * lookup (addr -> chunk). + * This function is responsible for setting up whatever is necessary for + * reverse lookup (addr -> chunk). 
*/ static int pcpu_map_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; @@ -244,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, page_end - page_start); if (err < 0) goto err; - } - /* mapping successful, link chunk and mark populated */ - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) + for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); - __set_bit(i, populated); } - return 0; - err: for_each_possible_cpu(tcpu) { if (tcpu == cpu) @@ -263,6 +234,7 @@ err: __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); } + pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } @@ -289,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. + * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned long *populated; - unsigned int cpu; - int rs, re, rc; - - /* quick path, check whether all pages are already there */ - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - goto clear; - /* need to allocate and map pages, this chunk can't be immutable */ - WARN_ON(chunk->immutable); - - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_free; - free_end = re; - } + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + return -ENOMEM; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); -clear: - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - return rc; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of 
the area to depopulate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * from @chunk. * * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); struct page **pages; - unsigned long *populated; - int rs, re; - - /* quick path, check whether it's empty already */ - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - return; - - /* immutable chunks can't be depopulated */ - WARN_ON(chunk->immutable); /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + pages = pcpu_get_pages(chunk); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, page_start, page_end); /* no need to flush tlb, vmalloc will handle it lazily */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index 2139e30..014bab6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,10 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 +#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -102,12 +106,16 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct work_struct map_extend_work;/* async ->map[] extension */ + void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ + int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; +static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ + +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ + /* - * Synchronization rules. - * - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks, populated bitmap and - * vmalloc mapping. The latter is a spinlock and protects the index - * data structures - chunk slots, chunks and area maps in chunks. 
- * - * During allocation, pcpu_alloc_mutex is kept locked all the time and - * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. In - * general, percpu memory can't be allocated with irq off but - * irqsave/restore are still used in alloc path so that it can be used - * from early init path - sched_init() specifically. - * - * Free path accesses and alters only the index data structures, so it - * can be safely called from atomic context. When memory needs to be - * returned to the system, free path schedules reclaim_work which - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be - * reclaimed, release both locks and frees the chunks. Note that it's - * necessary to grab both locks to remove a chunk from circulation as - * allocation path might be referencing the chunk with only - * pcpu_alloc_mutex locked. + * The number of empty populated pages, protected by pcpu_lock. The + * reserved chunk doesn't contribute to the count. */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ -static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ +static int pcpu_nr_empty_pop_pages; -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; -/* reclaim work to release fully free chunks, scheduled from free path */ -static void pcpu_reclaim(struct work_struct *work); -static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size) } /** + * pcpu_count_occupied_pages - count the number of pages an area occupies + * @chunk: chunk of interest + * @i: index of the area in question + * + * Count the number of pages chunk's @i'th area occupies. When the area's + * start and/or end address isn't aligned to page boundary, the straddled + * page is included in the count iff the rest of the page is free. 
+ */ +static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) +{ + int off = chunk->map[i] & ~1; + int end = chunk->map[i + 1] & ~1; + + if (!PAGE_ALIGNED(off) && i > 0) { + int prev = chunk->map[i - 1]; + + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) + off = round_down(off, PAGE_SIZE); + } + + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { + int next = chunk->map[i + 1]; + int nend = chunk->map[i + 2] & ~1; + + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) + end = round_up(end, PAGE_SIZE); + } + + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); +} + +/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on @@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) /** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest + * @is_atomic: the allocation context * - * Determine whether area map of @chunk needs to be extended to - * accommodate a new allocation. + * Determine whether area map of @chunk needs to be extended. If + * @is_atomic, only the amount necessary for a new allocation is + * considered; however, async extension is scheduled if the amount left is + * low. If !@is_atomic, it aims for more empty space. Combined, this + * ensures that the map is likely to have enough available space to + * accommodate atomic allocations which can't extend maps directly. * * CONTEXT: * pcpu_lock. @@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * New target map allocation length if extension is necessary, 0 * otherwise. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { - int new_alloc; + int margin, new_alloc; + + if (is_atomic) { + margin = 3; + + if (chunk->map_alloc < + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) + schedule_work(&chunk->map_extend_work); + } else { + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; + } - if (chunk->map_alloc >= chunk->map_used + 3) + if (chunk->map_alloc >= chunk->map_used + margin) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 3) + while (new_alloc < chunk->map_used + margin) new_alloc *= 2; return new_alloc; @@ -418,11 +469,76 @@ out_unlock: return 0; } +static void pcpu_map_extend_workfn(struct work_struct *work) +{ + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, + map_extend_work); + int new_alloc; + + spin_lock_irq(&pcpu_lock); + new_alloc = pcpu_need_to_extend(chunk, false); + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); +} + +/** + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area + * @chunk: chunk the candidate area belongs to + * @off: the offset to the start of the candidate area + * @this_size: the size of the candidate area + * @size: the size of the target allocation + * @align: the alignment of the target allocation + * @pop_only: only allocate from already populated region + * + * We're trying to allocate @size bytes aligned at @align. @chunk's area + * at @off sized @this_size is a candidate. This function determines + * whether the target allocation fits in the candidate area and returns the + * number of bytes to pad after @off. If the target area doesn't fit, -1 + * is returned.
+ * + * If @pop_only is %true, this function only considers the already + * populated part of the candidate area. + */ +static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, + int size, int align, bool pop_only) +{ + int cand_off = off; + + while (true) { + int head = ALIGN(cand_off, align) - off; + int page_start, page_end, rs, re; + + if (this_size < head + size) + return -1; + + if (!pop_only) + return head; + + /* + * If the first unpopulated page is beyond the end of the + * allocation, the whole allocation is populated; + * otherwise, retry from the end of the unpopulated area. + */ + page_start = PFN_DOWN(head + off); + page_end = PFN_UP(head + off + size); + + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + if (rs >= page_end) + return head; + cand_off = re * PAGE_SIZE; + } +} + /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes * @align: wanted align + * @pop_only: allocate only from the populated area + * @occ_pages_p: out param for the number of pages the area occupies * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -437,7 +553,8 @@ out_unlock: * Allocated offset in @chunk on success, -1 if no matching area is * found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, + bool pop_only, int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) if (off & 1) continue; - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - this_size = (p[1] & ~1) - off; - if (this_size < head + size) { + + head = pcpu_fit_in_area(chunk, off, this_size, size, align, + pop_only); + if (head < 0) { if (!seen_free) { chunk->first_free = i; seen_free = true; @@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) chunk->free_size -= size; *p |= 1; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); pcpu_chunk_relocate(chunk, oslot); return off; } @@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * pcpu_free_area - free area to a pcpu_chunk * @chunk: chunk of interest * @freeme: offset of area to free + * @occ_pages_p: out param for the number of pages the area occupies * * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap @@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * CONTEXT: * pcpu_lock. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, + int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int off = 0; @@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) *p = off &= ~1; chunk->free_size += (p[1] & ~1) - off; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* merge with next? 
*/ if (!(p[1] & 1)) to_free++; @@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) pcpu_mem_free(chunk, pcpu_chunk_struct_size); } +/** + * pcpu_chunk_populated - post-population bookkeeping + * @chunk: pcpu_chunk which got populated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been populated to @chunk. Update + * the bookkeeping information accordingly. Must be called after each + * successful population. + */ +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_set(chunk->populated, page_start, nr); + chunk->nr_populated += nr; + pcpu_nr_empty_pop_pages += nr; +} + +/** + * pcpu_chunk_depopulated - post-depopulation bookkeeping + * @chunk: pcpu_chunk which got depopulated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been depopulated from @chunk. + * Update the bookkeeping information accordingly. Must be called after + * each successful depopulation. + */ +static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_clear(chunk->populated, page_start, nr); + chunk->nr_populated -= nr; + pcpu_nr_empty_pop_pages -= nr; +} + /* * Chunk management implementation. * @@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc; + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + int occ_pages = 0; + int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk))) { + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; @@ -764,13 +934,15 @@ restart: if (size > chunk->contig_hint) continue; - new_alloc = pcpu_need_to_extend(chunk); + new_alloc = pcpu_need_to_extend(chunk, is_atomic); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); /* @@ -780,74 +952,134 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; } } - /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = pcpu_create_chunk(); - if (!chunk) { - err = "failed to allocate new chunk"; - goto fail_unlock_mutex; + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. 
+ */ + if (is_atomic) + goto fail; + + mutex_lock(&pcpu_alloc_mutex); + + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { + chunk = pcpu_create_chunk(); + if (!chunk) { + mutex_unlock(&pcpu_alloc_mutex); + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); } - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_relocate(chunk, -1); + mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + /* populate if not all pages are already there */ + if (!is_atomic) { + int page_start, page_end, rs, re; + + mutex_lock(&pcpu_alloc_mutex); + + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); + pcpu_free_area(chunk, off, &occ_pages); + err = "failed to populate"; + goto fail_unlock; + } + pcpu_chunk_populated(chunk, rs, re); + spin_unlock_irqrestore(&pcpu_lock, flags); + } + + mutex_unlock(&pcpu_alloc_mutex); } - mutex_unlock(&pcpu_alloc_mutex); + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages -= occ_pages; + + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - /* return address relative to base address */ ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); -fail_unlock_mutex: - mutex_unlock(&pcpu_alloc_mutex); - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); +fail: + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_balance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure.
*/ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** - * pcpu_reclaim - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. - * - * CONTEXT: - * workqueue context. */ -static void pcpu_reclaim(struct work_struct *work) +static void pcpu_balance_workfn(struct work_struct *work) { - LIST_HEAD(todo); - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. + */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, head, list) { + list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_move(&chunk->list, &todo); + list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + list_for_each_entry_safe(chunk, next, &to_free, list) { + int rs, re; + + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); + } pcpu_destroy_chunk(chunk); } + /* + * Ensure there are a certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from failing + * indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient.
+ */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int off, occ_pages; if (!ptr) return; @@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); + + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += occ_pages; /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { @@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_reclaim_work); + pcpu_schedule_balance_work(); break; } } @@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); + schunk->nr_populated = pcpu_unit_pages; if (ai->reserved_size) { schunk->free_size = ai->reserved_size; @@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); + dchunk->nr_populated = pcpu_unit_pages; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; @@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + pcpu_nr_empty_pop_pages += + pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ @@ -1965,3 +2283,15 @@ void __init 
percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * The percpu allocator is initialized early during boot, when neither slab + * nor workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index a8b9199..dfb79e0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t entry = *pmdp; if (pmd_numa(entry)) entry = pmd_mknonnuma(entry); - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/readahead.c b/mm/readahead.c index 0ca36a7..17b9172 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset, unsigned long max) { pgoff_t head; @@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping, { pgoff_t size; - size = count_history_pages(mapping, ra, offset, max); + size = count_history_pages(mapping, offset, max); /* * not enough history pages: @@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) unsigned long address = __vma_address(page, vma); /* page should be within @vma mapping range */ - VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); return address; } @@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page, struct anon_vma *anon_vma = vma->anon_vma; VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON(!anon_vma); + VM_BUG_ON_VMA(!anon_vma, vma); VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; @@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page, void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ if (PageTransHuge(page)) @@ -1032,25 +1032,6 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - - VM_BUG_ON_PAGE(PageLRU(page), page); - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { - SetPageActive(page); - lru_cache_add(page); - return; - } - - if (!TestSetPageMlocked(page)) { - /* - * We use the irq-unsafe __mod_zone_page_stat because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled.
- */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, - hpage_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); - } - add_page_to_unevictable_list(page); } /** @@ -1061,15 +1042,46 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - bool locked; + struct mem_cgroup *memcg; unsigned long flags; + bool locked; - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); +} + +static void page_remove_file_rmap(struct page *page) +{ + struct mem_cgroup *memcg; + unsigned long flags; + bool locked; + + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + goto out; + + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) + goto out; + + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. + */ + __dec_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); +out: + mem_cgroup_end_page_stat(memcg, locked, flags); } /** @@ -1080,47 +1092,33 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { - bool anon = PageAnon(page); - bool locked; - unsigned long flags; - - /* - * The anon case has no mem_cgroup page_stat to update; but may - * uncharge_page() below, where the lock ordering can deadlock if - * we hold the lock against page_stat move: so avoid it on anon. - */ - if (!anon) - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + if (!PageAnon(page)) { + page_remove_file_rmap(page); + return; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; /* - * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED - * and not charged by memcg for now. - * * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and - * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. 
*/ - if (unlikely(PageHuge(page))) - goto out; - if (anon) { - mem_cgroup_uncharge_page(page); - if (PageTransHuge(page)) - __dec_zone_page_state(page, - NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); - } else { - __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); - mem_cgroup_end_update_page_stat(page, &locked, &flags); - } + if (PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + -hpage_nr_pages(page)); + if (unlikely(PageMlocked(page))) clear_page_mlock(page); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1130,10 +1128,6 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - return; -out: - if (!anon) - mem_cgroup_end_update_page_stat(page, &locked, &flags); } /* @@ -1375,7 +1369,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, continue; /* don't unmap */ } - if (ptep_clear_flush_young_notify(vma, address, pte)) + /* + * No need for _notify because we're within an + * mmu_notifier_invalidate_range_ {start|end} scope. + */ + if (ptep_clear_flush_young(vma, address, pte)) continue; /* Nuke the page table entry. */ @@ -1686,7 +1684,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) * structure at mapping cannot be freed and reused yet, * so we can safely take mapping->i_mmap_mutex. */ - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) return ret; @@ -66,6 +66,9 @@ static struct vfsmount *shm_mnt; #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> +#include <linux/syscalls.h> +#include <linux/fcntl.h> +#include <uapi/linux/memfd.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -149,6 +152,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) vm_unacct_memory(VM_ACCT(size)); } +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. 
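The rmap conversion above is a compact illustration of the new memcg page-stat protocol: mem_cgroup_begin_page_stat() now returns the memcg (taking the move lock only when a stat move is in flight), the caller updates its counters against that memcg, and mem_cgroup_end_page_stat() unwinds. A condensed sketch of the calling convention, using the same calls as page_add_file_rmap() above; my_account_file_page() is a made-up wrapper name.

	static void my_account_file_page(struct page *page)
	{
		struct mem_cgroup *memcg;
		unsigned long flags;
		bool locked;

		/* pin the page's memcg; takes the move lock only if needed */
		memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
		if (atomic_inc_and_test(&page->_mapcount)) {
			__inc_zone_page_state(page, NR_FILE_MAPPED);
			mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
		}
		mem_cgroup_end_page_stat(memcg, locked, flags);
	}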
@@ -280,7 +296,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
-				   pgoff_t index, gfp_t gfp, void *expected)
+				   pgoff_t index, void *expected)
 {
 	int error;
 
@@ -406,7 +422,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 							pvec.pages, indices);
 		if (!pvec.nr)
 			break;
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -434,7 +449,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
@@ -482,7 +496,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			index = start;
 			continue;
 		}
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -518,7 +531,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		index++;
 	}
 
@@ -538,6 +550,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
 static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	int error;
 
 	error = inode_change_ok(inode, attr);
@@ -548,7 +561,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 		loff_t oldsize = inode->i_size;
 		loff_t newsize = attr->ia_size;
 
+		/* protected by i_mutex */
+		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
+		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
+			return -EPERM;
+
 		if (newsize != oldsize) {
+			error = shmem_reacct_size(SHMEM_I(inode)->flags,
+					oldsize, newsize);
+			if (error)
+				return error;
 			i_size_write(inode, newsize);
 			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 		}
@@ -604,7 +626,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
 	if (index == -1)
-		return 0;
+		return -EAGAIN;	/* tell shmem_unuse we found nothing */
 
 	/*
 	 * Move _head_ to start search for next from here.
@@ -649,7 +671,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	 */
 	if (!error)
 		error = shmem_add_to_page_cache(*pagep, mapping, index,
-						GFP_NOWAIT, radswap);
+						radswap);
 	if (error != -ENOMEM) {
 		/*
 		 * Truncation and eviction use free_swap_and_cache(), which
@@ -663,7 +685,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 			spin_unlock(&info->lock);
 			swap_free(swap);
 		}
-		error = 1;	/* not an error, but entry was found */
 	}
 	return error;
 }
@@ -675,7 +696,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 {
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
-	int found = 0;
+	struct mem_cgroup *memcg;
 	int error = 0;
 
 	/*
@@ -690,26 +711,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
 	 * Charged back to the user (not to caller) when swap account is used.
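 	 *
 	 * (Editor's note, summarising the error protocol introduced below:
 	 * -EAGAIN from shmem_unuse_inode() means "not in this inode, keep
 	 * walking the swaplist"; 0 means the page was reinstated, so the
 	 * memcg charge is committed; any other error aborts the walk, and
 	 * only -ENOMEM is reported back to the caller.)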
 	 */
-	error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
 	if (error)
 		goto out;
 	/* No radix_tree_preload: swap entry keeps a place for page in tree */
+	error = -EAGAIN;
 
 	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, &page);
+			error = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
-		if (found)
+		if (error != -EAGAIN)
 			break;
+		/* found nothing in this: move on to search the next */
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (found < 0)
-		error = found;
+	if (error) {
+		if (error != -ENOMEM)
+			error = 0;
+		mem_cgroup_cancel_charge(page, memcg);
+	} else
+		mem_cgroup_commit_charge(page, memcg, true);
 out:
 	unlock_page(page);
 	page_cache_release(page);
@@ -813,7 +840,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	mutex_unlock(&shmem_swaplist_mutex);
-	swapcache_free(swap, NULL);
+	swapcache_free(swap);
 redirty:
 	set_page_dirty(page);
 	if (wbc->for_reclaim)
@@ -986,7 +1013,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 		 */
 		oldpage = newpage;
 	} else {
-		mem_cgroup_replace_page_cache(oldpage, newpage);
+		mem_cgroup_migrate(oldpage, newpage, false);
 		lru_cache_add_anon(newpage);
 		*pagep = newpage;
 	}
@@ -1013,6 +1040,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo;
+	struct mem_cgroup *memcg;
 	struct page *page;
 	swp_entry_t swap;
 	int error;
@@ -1091,11 +1119,10 @@ repeat:
 			goto failed;
 		}
 
-		error = mem_cgroup_charge_file(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
-						gfp, swp_to_radix_entry(swap));
+						swp_to_radix_entry(swap));
 			/*
 			 * We already confirmed swap under page lock, and make
 			 * no memory allocation here, so usually no possibility
@@ -1108,12 +1135,16 @@ repeat:
 		 * Reset swap.val? No, leave it so "failed" goes back to
 		 * "repeat": reading a hole and writing should succeed.
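 		 *
 		 * (Editor's sketch, not part of this patch: the charge calls
 		 * used here follow the new three-step memcg protocol, where
 		 * add_page_somewhere() stands in for whatever links the page:
 		 *
 		 *	error = mem_cgroup_try_charge(page, mm, gfp, &memcg);
 		 *	if (!error) {
 		 *		error = add_page_somewhere(page);
 		 *		if (error)
 		 *			mem_cgroup_cancel_charge(page, memcg);
 		 *		else
 		 *			mem_cgroup_commit_charge(page, memcg,
 		 *						 lrucare);
 		 *	}
 		 * )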
 		 */
-		if (error)
+		if (error) {
+			mem_cgroup_cancel_charge(page, memcg);
 			delete_from_swap_cache(page);
+		}
 	}
 	if (error)
 		goto failed;
 
+	mem_cgroup_commit_charge(page, memcg, true);
+
 	spin_lock(&info->lock);
 	info->swapped--;
 	shmem_recalc_inode(inode);
@@ -1149,22 +1180,22 @@ repeat:
 		__SetPageSwapBacked(page);
 		__set_page_locked(page);
 		if (sgp == SGP_WRITE)
-			init_page_accessed(page);
+			__SetPageReferenced(page);
 
-		error = mem_cgroup_charge_file(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (error)
 			goto decused;
 		error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
-							gfp, NULL);
+							NULL);
 			radix_tree_preload_end();
 		}
 		if (error) {
-			mem_cgroup_uncharge_cache_page(page);
+			mem_cgroup_cancel_charge(page, memcg);
 			goto decused;
 		}
+		mem_cgroup_commit_charge(page, memcg, false);
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -1390,6 +1421,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		info = SHMEM_I(inode);
 		memset(info, 0, (char *)inode - (char *)info);
 		spin_lock_init(&info->lock);
+		info->seals = F_SEAL_SEAL;
 		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->swaplist);
 		simple_xattrs_init(&info->xattrs);
@@ -1448,7 +1480,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+	/* i_mutex is held by caller */
+	if (unlikely(info->seals)) {
+		if (info->seals & F_SEAL_WRITE)
+			return -EPERM;
+		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
+			return -EPERM;
+	}
+
 	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 }
 
@@ -1786,11 +1828,233 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 	return offset;
 }
 
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on shmem.
+ */
+#define SHMEM_TAG_PINNED	PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN		4	/* about 150ms max */
+
+static void shmem_tag_pins(struct address_space *mapping)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+
+	lru_add_drain();
+	start = 0;
+	rcu_read_lock();
+
+restart:
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+		page = radix_tree_deref_slot(slot);
+		if (!page || radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto restart;
+		} else if (page_count(page) - page_mapcount(page) > 1) {
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_set(&mapping->page_tree, iter.index,
+					   SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+		}
+
+		if (need_resched()) {
+			cond_resched_rcu();
+			start = iter.index + 1;
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (e.g., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether they have an elevated ref-count. If so, we tag them and wait
+ * for the extra references to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
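+ *
+ * (Editor's note, derived from the constants above: with LAST_SCAN == 4 the
+ * killable back-off sleeps below are (HZ << scan) / 200 jiffies, i.e. about
+ * 10ms + 20ms + 40ms + 80ms over the four retry scans, which is where the
+ * "about 150ms max" figure next to LAST_SCAN comes from.)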
+ */
+static int shmem_wait_for_pins(struct address_space *mapping)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+	int error, scan;
+
+	shmem_tag_pins(mapping);
+
+	error = 0;
+	for (scan = 0; scan <= LAST_SCAN; scan++) {
+		if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+			break;
+
+		if (!scan)
+			lru_add_drain_all();
+		else if (schedule_timeout_killable((HZ << scan) / 200))
+			scan = LAST_SCAN;
+
+		start = 0;
+		rcu_read_lock();
+restart:
+		radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+					   start, SHMEM_TAG_PINNED) {
+
+			page = radix_tree_deref_slot(slot);
+			if (radix_tree_exception(page)) {
+				if (radix_tree_deref_retry(page))
+					goto restart;
+
+				page = NULL;
+			}
+
+			if (page &&
+			    page_count(page) - page_mapcount(page) != 1) {
+				if (scan < LAST_SCAN)
+					goto continue_resched;
+
+				/*
+				 * On the last scan, we clean up all those tags
+				 * we inserted; but make a note that we still
+				 * found pages pinned.
+				 */
+				error = -EBUSY;
+			}
+
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_clear(&mapping->page_tree,
+					     iter.index, SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+continue_resched:
+			if (need_resched()) {
+				cond_resched_rcu();
+				start = iter.index + 1;
+				goto restart;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	return error;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+		     F_SEAL_SHRINK | \
+		     F_SEAL_GROW | \
+		     F_SEAL_WRITE)
+
+int shmem_add_seals(struct file *file, unsigned int seals)
+{
+	struct inode *inode = file_inode(file);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	int error;
+
+	/*
+	 * SEALING
+	 * Sealing allows multiple parties to share a shmem-file but restrict
+	 * access to a specific subset of file operations. Seals can only be
+	 * added, but never removed. This way, mutually untrusted parties can
+	 * share common memory regions with a well-defined policy. A malicious
+	 * peer can thus never perform unwanted operations on a shared object.
+	 *
+	 * Seals are only supported on special shmem-files and always affect
+	 * the whole underlying inode. Once a seal is set, it may prevent some
+	 * kinds of access to the file. Currently, the following seals are
+	 * defined:
+	 *   SEAL_SEAL: Prevent further seals from being set on this file
+	 *   SEAL_SHRINK: Prevent the file from shrinking
+	 *   SEAL_GROW: Prevent the file from growing
+	 *   SEAL_WRITE: Prevent write access to the file
+	 *
+	 * As we don't require any trust relationship between two parties, we
+	 * must prevent seals from being removed. Therefore, sealing a file
+	 * only adds a given set of seals to the file, it never touches
+	 * existing seals. Furthermore, the "setting seals"-operation can be
+	 * sealed itself, which basically prevents any further seal from being
+	 * added.
+	 *
+	 * Semantics of sealing are only defined on volatile files. Only
+	 * anonymous shmem files support sealing. More importantly, seals are
+	 * never written to disk. Therefore, there's no plan to support it on
+	 * other file types.
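+	 *
+	 * (Editor's sketch, not part of this patch: the intended user-space
+	 * flow, assuming the memfd_create() syscall added later in this diff
+	 * plus the F_ADD_SEALS/F_GET_SEALS fcntl() commands:
+	 *
+	 *	int fd = memfd_create("shared", MFD_ALLOW_SEALING);
+	 *	ftruncate(fd, size);
+	 *	... fill the file through mmap() or write() ...
+	 *	fcntl(fd, F_ADD_SEALS,
+	 *	      F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL);
+	 *	... pass fd over an AF_UNIX socket; receivers can verify the
+	 *	seal set via fcntl(fd, F_GET_SEALS) and then map the contents
+	 *	read-only without fearing truncation-induced SIGBUS ... )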
+	 */
+
+	if (file->f_op != &shmem_file_operations)
+		return -EINVAL;
+	if (!(file->f_mode & FMODE_WRITE))
+		return -EPERM;
+	if (seals & ~(unsigned int)F_ALL_SEALS)
+		return -EINVAL;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (info->seals & F_SEAL_SEAL) {
+		error = -EPERM;
+		goto unlock;
+	}
+
+	if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
+		error = mapping_deny_writable(file->f_mapping);
+		if (error)
+			goto unlock;
+
+		error = shmem_wait_for_pins(file->f_mapping);
+		if (error) {
+			mapping_allow_writable(file->f_mapping);
+			goto unlock;
+		}
+	}
+
+	info->seals |= seals;
+	error = 0;
+
+unlock:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+EXPORT_SYMBOL_GPL(shmem_add_seals);
+
+int shmem_get_seals(struct file *file)
+{
+	if (file->f_op != &shmem_file_operations)
+		return -EINVAL;
+
+	return SHMEM_I(file_inode(file))->seals;
+}
+EXPORT_SYMBOL_GPL(shmem_get_seals);
+
+long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long error;
+
+	switch (cmd) {
+	case F_ADD_SEALS:
+		/* disallow upper 32bit */
+		if (arg > UINT_MAX)
+			return -EINVAL;
+
+		error = shmem_add_seals(file, arg);
+		break;
+	case F_GET_SEALS:
+		error = shmem_get_seals(file);
+		break;
+	default:
+		error = -EINVAL;
+		break;
+	}
+
+	return error;
+}
+
 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 							 loff_t len)
 {
 	struct inode *inode = file_inode(file);
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct shmem_falloc shmem_falloc;
 	pgoff_t start, index, end;
 	int error;
@@ -1806,6 +2070,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
 		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
 
+		/* protected by i_mutex */
+		if (info->seals & F_SEAL_WRITE) {
+			error = -EPERM;
+			goto out;
+		}
+
 		shmem_falloc.waitq = &shmem_falloc_waitq;
 		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
@@ -1832,6 +2102,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 	if (error)
 		goto out;
 
+	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
+		error = -EPERM;
+		goto out;
+	}
+
 	start = offset >> PAGE_CACHE_SHIFT;
 	end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	/* Try to avoid a swapstorm if len is impossible to satisfy */
@@ -2048,24 +2323,88 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 	return shmem_unlink(dir, dentry);
 }
 
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry,
+			  struct inode *new_dir, struct dentry *new_dentry)
+{
+	bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+	bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
+
+	if (old_dir != new_dir && old_is_dir != new_is_dir) {
+		if (old_is_dir) {
+			drop_nlink(old_dir);
+			inc_nlink(new_dir);
+		} else {
+			drop_nlink(new_dir);
+			inc_nlink(old_dir);
+		}
+	}
+	old_dir->i_ctime = old_dir->i_mtime =
+	new_dir->i_ctime = new_dir->i_mtime =
+	old_dentry->d_inode->i_ctime =
+	new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+
+static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+{
+	struct dentry *whiteout;
+	int error;
+
+	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
+	if (!whiteout)
+		return -ENOMEM;
+
+	error = shmem_mknod(old_dir, whiteout,
+			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+	dput(whiteout);
+	if (error)
+		return error;
+
+	/*
+	 * Cheat and hash the whiteout while the old dentry is still in
+	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
+	 *
+	 * d_lookup() will consistently find one of them at this point,
+	 * not sure which one, but that isn't even important.
+	 */
+	d_rehash(whiteout);
+	return 0;
+}
+
 /*
  * The VFS layer already does all the dentry stuff for rename,
  * we just have to decrement the usage count for the target if
 * it exists so that the VFS layer correctly frees it when it
 * gets overwritten.
 */
-static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry)
+static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry,
+			 unsigned int flags)
 {
 	struct inode *inode = old_dentry->d_inode;
 	int they_are_dirs = S_ISDIR(inode->i_mode);
 
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+		return -EINVAL;
+
+	if (flags & RENAME_EXCHANGE)
+		return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
 	if (!simple_empty(new_dentry))
 		return -ENOTEMPTY;
 
+	if (flags & RENAME_WHITEOUT) {
+		int error;
+
+		error = shmem_whiteout(old_dir, old_dentry);
+		if (error)
+			return error;
+	}
+
 	if (new_dentry->d_inode) {
 		(void) shmem_unlink(new_dir, new_dentry);
-		if (they_are_dirs)
+		if (they_are_dirs) {
+			drop_nlink(new_dentry->d_inode);
 			drop_nlink(old_dir);
+		}
 	} else if (they_are_dirs) {
 		drop_nlink(old_dir);
 		inc_nlink(new_dir);
@@ -2567,6 +2906,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 	shmem_show_mpol(seq, sbinfo->mpol);
 	return 0;
 }
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+
+SYSCALL_DEFINE2(memfd_create,
+		const char __user *, uname,
+		unsigned int, flags)
+{
+	struct shmem_inode_info *info;
+	struct file *file;
+	int fd, error;
+	char *name;
+	long len;
+
+	if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+		return -EINVAL;
+
+	/* length includes terminating zero */
+	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+	if (len <= 0)
+		return -EFAULT;
+	if (len > MFD_NAME_MAX_LEN + 1)
+		return -EINVAL;
+
+	name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
+	if (!name)
+		return -ENOMEM;
+
+	strcpy(name, MFD_NAME_PREFIX);
+	if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+		error = -EFAULT;
+		goto err_name;
+	}
+
+	/* terminating-zero may have changed after strnlen_user() returned */
+	if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+		error = -EFAULT;
+		goto err_name;
+	}
+
+	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+	if (fd < 0) {
+		error = fd;
+		goto err_name;
+	}
+
+	file = shmem_file_setup(name, 0, VM_NORESERVE);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_fd;
+	}
+	info = SHMEM_I(file_inode(file));
+	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+	file->f_flags |= O_RDWR | O_LARGEFILE;
+	if (flags & MFD_ALLOW_SEALING)
+		info->seals &= ~F_SEAL_SEAL;
+
+	fd_install(fd, file);
+	kfree(name);
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);
+err_name:
+	kfree(name);
+	return error;
+}
+
 #endif /* CONFIG_TMPFS */
 
 static void shmem_put_super(struct super_block *sb)
@@ -2619,7 +3029,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 #endif
 
 	spin_lock_init(&sbinfo->stat_lock);
-	if (percpu_counter_init(&sbinfo->used_blocks, 0))
+	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
 		goto failed;
 	sbinfo->free_inodes = sbinfo->max_inodes;
 
@@ -2701,7 +3111,9 @@ static const struct address_space_operations shmem_aops = {
 	.write_begin	= shmem_write_begin,
 	.write_end	= shmem_write_end,
 #endif
+#ifdef CONFIG_MIGRATION
 	.migratepage	= migrate_page,
+#endif
 	.error_remove_page = generic_error_remove_page,
 };
 
@@ -2741,7 +3153,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
 	.mkdir		= shmem_mkdir,
 	.rmdir		= shmem_rmdir,
 	.mknod		= shmem_mknod,
-	.rename		= shmem_rename,
+	.rename2	= shmem_rename2,
 	.tmpfile	= shmem_tmpfile,
 #endif
 #ifdef CONFIG_TMPFS_XATTR
@@ -2932,16 +3344,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 	this.len = strlen(name);
 	this.hash = 0; /* will go */
 	sb = shm_mnt->mnt_sb;
+	path.mnt = mntget(shm_mnt);
 	path.dentry = d_alloc_pseudo(sb, &this);
 	if (!path.dentry)
 		goto put_memory;
 	d_set_d_op(path.dentry, &anon_ops);
-	path.mnt = mntget(shm_mnt);
 
 	res = ERR_PTR(-ENOSPC);
 	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
 	if (!inode)
-		goto put_dentry;
+		goto put_memory;
 
 	inode->i_flags |= i_flags;
 	d_instantiate(path.dentry, inode);
@@ -2949,19 +3361,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 	clear_nlink(inode);	/* It is unlinked */
 	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
 	if (IS_ERR(res))
-		goto put_dentry;
+		goto put_path;
 
 	res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
 		  &shmem_file_operations);
 	if (IS_ERR(res))
-		goto put_dentry;
+		goto put_path;
 
 	return res;
 
-put_dentry:
-	path_put(&path);
 put_memory:
 	shmem_unacct_size(flags, size);
+put_path:
+	path_put(&path);
 	return res;
 }
 
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,7 +191,6 @@ struct array_cache {
 	unsigned int limit;
 	unsigned int batchcount;
 	unsigned int touched;
-	spinlock_t lock;
 	void *entry[];	/*
 			 * Must have this definition in here for the proper
 			 * alignment of array_cache. Also simplifies accessing
@@ -203,6 +202,11 @@ struct array_cache {
 			 */
 };
 
+struct alien_cache {
+	spinlock_t lock;
+	struct array_cache ac;
+};
+
 #define SLAB_OBJ_PFMEMALLOC	1
 static inline bool is_obj_pfmemalloc(void *objp)
 {
@@ -233,22 +237,21 @@ struct arraycache_init {
 /*
  * Need this for bootstrapping a per node allocator.
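 *
 * (Editor's note: with the static arraycache bootstrap gone, only two banks
 * of init lists remain, which is why NUM_INIT_LISTS drops below from
 * 3 * MAX_NUMNODES to 2 * MAX_NUMNODES:
 *
 *	[0            .. MAX_NUMNODES-1]	CACHE_CACHE (kmem_cache itself)
 *	[MAX_NUMNODES .. 2*MAX_NUMNODES-1]	SIZE_NODE (kmalloc-node cache)
 * )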
 */
-#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
+#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
 static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
 #define	CACHE_CACHE 0
-#define	SIZE_AC MAX_NUMNODES
-#define	SIZE_NODE (2 * MAX_NUMNODES)
+#define	SIZE_NODE (MAX_NUMNODES)
 
 static int drain_freelist(struct kmem_cache *cache,
 			struct kmem_cache_node *n, int tofree);
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
-			int node);
+			int node, struct list_head *list);
+static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 static void cache_reap(struct work_struct *unused);
 
 static int slab_early_init = 1;
 
-#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
 #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
 
 static void kmem_cache_node_init(struct kmem_cache_node *parent)
@@ -267,7 +270,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
 	do {								\
 		INIT_LIST_HEAD(listp);					\
-		list_splice(&(cachep->node[nodeid]->slab), listp);	\
+		list_splice(&get_node(cachep, nodeid)->slab, listp);	\
 	} while (0)
 
 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
@@ -453,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
-static struct arraycache_init initarray_generic =
-    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
-
 /* internal cache of cache description objs */
 static struct kmem_cache kmem_cache_boot = {
 	.batchcount = 1,
@@ -467,146 +467,11 @@ static struct kmem_cache kmem_cache_boot = {
 
 #define BAD_ALIEN_MAGIC 0x01020304ul
 
-#ifdef CONFIG_LOCKDEP
-
-/*
- * Slab sometimes uses the kmalloc slabs to store the slab headers
- * for other slabs "off slab".
- * The locking for this is tricky in that it nests within the locks
- * of all other slabs in a few places; to deal with this special
- * locking we put on-slab caches into a separate lock-class.
- *
- * We set lock class for alien array caches which are up during init.
- * The lock annotation will be lost if all cpus of a node goes down and
- * then comes back up during hotplug
- */
-static struct lock_class_key on_slab_l3_key;
-static struct lock_class_key on_slab_alc_key;
-
-static struct lock_class_key debugobj_l3_key;
-static struct lock_class_key debugobj_alc_key;
-
-static void slab_set_lock_classes(struct kmem_cache *cachep,
-		struct lock_class_key *l3_key, struct lock_class_key *alc_key,
-		int q)
-{
-	struct array_cache **alc;
-	struct kmem_cache_node *n;
-	int r;
-
-	n = cachep->node[q];
-	if (!n)
-		return;
-
-	lockdep_set_class(&n->list_lock, l3_key);
-	alc = n->alien;
-	/*
-	 * FIXME: This check for BAD_ALIEN_MAGIC
-	 * should go away when common slab code is taught to
-	 * work even without alien caches.
-	 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
-	 * for alloc_alien_cache,
-	 */
-	if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
-		return;
-	for_each_node(r) {
-		if (alc[r])
-			lockdep_set_class(&alc[r]->lock, alc_key);
-	}
-}
-
-static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
-{
-	slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
-}
-
-static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
-{
-	int node;
-
-	for_each_online_node(node)
-		slab_set_debugobj_lock_classes_node(cachep, node);
-}
-
-static void init_node_lock_keys(int q)
-{
-	int i;
-
-	if (slab_state < UP)
-		return;
-
-	for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
-		struct kmem_cache_node *n;
-		struct kmem_cache *cache = kmalloc_caches[i];
-
-		if (!cache)
-			continue;
-
-		n = cache->node[q];
-		if (!n || OFF_SLAB(cache))
-			continue;
-
-		slab_set_lock_classes(cache, &on_slab_l3_key,
-				&on_slab_alc_key, q);
-	}
-}
-
-static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
-{
-	if (!cachep->node[q])
-		return;
-
-	slab_set_lock_classes(cachep, &on_slab_l3_key,
-			&on_slab_alc_key, q);
-}
-
-static inline void on_slab_lock_classes(struct kmem_cache *cachep)
-{
-	int node;
-
-	VM_BUG_ON(OFF_SLAB(cachep));
-	for_each_node(node)
-		on_slab_lock_classes_node(cachep, node);
-}
-
-static inline void init_lock_keys(void)
-{
-	int node;
-
-	for_each_node(node)
-		init_node_lock_keys(node);
-}
-#else
-static void init_node_lock_keys(int q)
-{
-}
-
-static inline void init_lock_keys(void)
-{
-}
-
-static inline void on_slab_lock_classes(struct kmem_cache *cachep)
-{
-}
-
-static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
-{
-}
-
-static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
-{
-}
-
-static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
-{
-}
-#endif
-
 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
 
 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 {
-	return cachep->array[smp_processor_id()];
+	return this_cpu_ptr(cachep->cpu_cache);
 }
 
 static size_t calculate_freelist_size(int nr_objs, size_t align)
@@ -792,13 +657,8 @@ static void start_cpu_timer(int cpu)
 	}
 }
 
-static struct array_cache *alloc_arraycache(int node, int entries,
-					    int batchcount, gfp_t gfp)
+static void init_arraycache(struct array_cache *ac, int limit, int batch)
 {
-	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
-	struct array_cache *nc = NULL;
-
-	nc = kmalloc_node(memsize, gfp, node);
 	/*
 	 * The array_cache structures contain pointers to free object.
 	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
 	 * valid references during a kmemleak scan. Therefore, kmemleak must
 	 * not scan such objects.
 	 */
-	kmemleak_no_scan(nc);
-	if (nc) {
-		nc->avail = 0;
-		nc->limit = entries;
-		nc->batchcount = batchcount;
-		nc->touched = 0;
-		spin_lock_init(&nc->lock);
+	kmemleak_no_scan(ac);
+	if (ac) {
+		ac->avail = 0;
+		ac->limit = limit;
+		ac->batchcount = batch;
+		ac->touched = 0;
 	}
-	return nc;
+}
+
+static struct array_cache *alloc_arraycache(int node, int entries,
+					    int batchcount, gfp_t gfp)
+{
+	size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
+	struct array_cache *ac = NULL;
+
+	ac = kmalloc_node(memsize, gfp, node);
+	init_arraycache(ac, entries, batchcount);
+	return ac;
 }
 
 static inline bool is_slab_pfmemalloc(struct page *page)
@@ -826,7 +695,7 @@ static inline bool is_slab_pfmemalloc(struct page *page)
 static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
 						struct array_cache *ac)
 {
-	struct kmem_cache_node *n = cachep->node[numa_mem_id()];
+	struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
 	struct page *page;
 	unsigned long flags;
 
@@ -881,7 +750,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
 		 * If there are empty slabs on the slabs_free list and we are
 		 * being forced to refill the cache, mark this one !pfmemalloc.
 		 */
-		n = cachep->node[numa_mem_id()];
+		n = get_node(cachep, numa_mem_id());
 		if (!list_empty(&n->slabs_free) && force_refill) {
 			struct page *page = virt_to_head_page(objp);
 			ClearPageSlabPfmemalloc(page);
@@ -911,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep,
 	return objp;
 }
 
-static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
-								void *objp)
+static noinline void *__ac_put_obj(struct kmem_cache *cachep,
+			struct array_cache *ac, void *objp)
 {
 	if (unlikely(pfmemalloc_active)) {
 		/* Some pfmemalloc slabs exist, check if this is one */
@@ -961,12 +830,13 @@ static int transfer_objects(struct array_cache *to,
 #define drain_alien_cache(cachep, alien) do { } while (0)
 #define reap_alien(cachep, n) do { } while (0)
 
-static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
+static inline struct alien_cache **alloc_alien_cache(int node,
+						int limit, gfp_t gfp)
 {
-	return (struct array_cache **)BAD_ALIEN_MAGIC;
+	return (struct alien_cache **)BAD_ALIEN_MAGIC;
 }
 
-static inline void free_alien_cache(struct array_cache **ac_ptr)
+static inline void free_alien_cache(struct alien_cache **ac_ptr)
 {
 }
 
@@ -992,46 +862,60 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
-static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
+static struct alien_cache *__alloc_alien_cache(int node, int entries,
+						int batch, gfp_t gfp)
+{
+	size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
+	struct alien_cache *alc = NULL;
+
+	alc = kmalloc_node(memsize, gfp, node);
+	init_arraycache(&alc->ac, entries, batch);
+	spin_lock_init(&alc->lock);
+	return alc;
+}
+
+static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
-	struct array_cache **ac_ptr;
-	int memsize = sizeof(void *) * nr_node_ids;
+	struct alien_cache **alc_ptr;
+	size_t memsize = sizeof(void *) * nr_node_ids;
 	int i;
 
 	if (limit > 1)
 		limit = 12;
-	ac_ptr = kzalloc_node(memsize, gfp, node);
-	if (ac_ptr) {
-		for_each_node(i) {
-			if (i == node || !node_online(i))
-				continue;
-			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
-			if (!ac_ptr[i]) {
-				for (i--; i >= 0; i--)
-					kfree(ac_ptr[i]);
-				kfree(ac_ptr);
-				return NULL;
-			}
+	alc_ptr = kzalloc_node(memsize, gfp, node);
+	if (!alc_ptr)
+		return NULL;
+
+	for_each_node(i) {
+		if (i == node || !node_online(i))
+			continue;
+		alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
+		if (!alc_ptr[i]) {
+			for (i--; i >= 0; i--)
+				kfree(alc_ptr[i]);
+			kfree(alc_ptr);
+			return NULL;
 		}
 	}
-	return ac_ptr;
+	return alc_ptr;
 }
 
-static void free_alien_cache(struct array_cache **ac_ptr)
+static void free_alien_cache(struct alien_cache **alc_ptr)
 {
 	int i;
 
-	if (!ac_ptr)
+	if (!alc_ptr)
 		return;
 	for_each_node(i)
-	    kfree(ac_ptr[i]);
-	kfree(ac_ptr);
+	    kfree(alc_ptr[i]);
+	kfree(alc_ptr);
 }
 
 static void __drain_alien_cache(struct kmem_cache *cachep,
-				struct array_cache *ac, int node)
+				struct array_cache *ac, int node,
+				struct list_head *list)
 {
-	struct kmem_cache_node *n = cachep->node[node];
+	struct kmem_cache_node *n = get_node(cachep, node);
 
 	if (ac->avail) {
 		spin_lock(&n->list_lock);
@@ -1043,7 +927,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 		if (n->shared)
 			transfer_objects(n->shared, ac, ac->limit);
 
-		free_block(cachep, ac->entry, ac->avail, node);
+		free_block(cachep, ac->entry, ac->avail, node, list);
 		ac->avail = 0;
 		spin_unlock(&n->list_lock);
 	}
@@ -1057,66 +941,88 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
 	int node = __this_cpu_read(slab_reap_node);
 
 	if (n->alien) {
-		struct array_cache *ac = n->alien[node];
+		struct alien_cache *alc = n->alien[node];
+		struct array_cache *ac;
+
+		if (alc) {
+			ac = &alc->ac;
+			if (ac->avail && spin_trylock_irq(&alc->lock)) {
+				LIST_HEAD(list);
 
-		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
-			__drain_alien_cache(cachep, ac, node);
-			spin_unlock_irq(&ac->lock);
+				__drain_alien_cache(cachep, ac, node, &list);
+				spin_unlock_irq(&alc->lock);
+				slabs_destroy(cachep, &list);
+			}
 		}
 	}
 }
 
 static void drain_alien_cache(struct kmem_cache *cachep,
-				struct array_cache **alien)
+				struct alien_cache **alien)
 {
 	int i = 0;
+	struct alien_cache *alc;
 	struct array_cache *ac;
 	unsigned long flags;
 
 	for_each_online_node(i) {
-		ac = alien[i];
-		if (ac) {
-			spin_lock_irqsave(&ac->lock, flags);
-			__drain_alien_cache(cachep, ac, i);
-			spin_unlock_irqrestore(&ac->lock, flags);
+		alc = alien[i];
+		if (alc) {
+			LIST_HEAD(list);
+
+			ac = &alc->ac;
+			spin_lock_irqsave(&alc->lock, flags);
+			__drain_alien_cache(cachep, ac, i, &list);
+			spin_unlock_irqrestore(&alc->lock, flags);
+			slabs_destroy(cachep, &list);
 		}
 	}
 }
 
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
+				int node, int page_node)
 {
-	int nodeid = page_to_nid(virt_to_page(objp));
 	struct kmem_cache_node *n;
-	struct array_cache *alien = NULL;
-	int node;
-
-	node = numa_mem_id();
-
-	/*
-	 * Make sure we are not freeing a object from another node to the array
-	 * cache on this cpu.
-	 */
-	if (likely(nodeid == node))
-		return 0;
+	struct alien_cache *alien = NULL;
+	struct array_cache *ac;
+	LIST_HEAD(list);
 
-	n = cachep->node[node];
+	n = get_node(cachep, node);
 	STATS_INC_NODEFREES(cachep);
-	if (n->alien && n->alien[nodeid]) {
-		alien = n->alien[nodeid];
+	if (n->alien && n->alien[page_node]) {
+		alien = n->alien[page_node];
+		ac = &alien->ac;
 		spin_lock(&alien->lock);
-		if (unlikely(alien->avail == alien->limit)) {
+		if (unlikely(ac->avail == ac->limit)) {
 			STATS_INC_ACOVERFLOW(cachep);
-			__drain_alien_cache(cachep, alien, nodeid);
+			__drain_alien_cache(cachep, ac, page_node, &list);
 		}
-		ac_put_obj(cachep, alien, objp);
+		ac_put_obj(cachep, ac, objp);
 		spin_unlock(&alien->lock);
+		slabs_destroy(cachep, &list);
 	} else {
-		spin_lock(&(cachep->node[nodeid])->list_lock);
-		free_block(cachep, &objp, 1, nodeid);
-		spin_unlock(&(cachep->node[nodeid])->list_lock);
+		n = get_node(cachep, page_node);
+		spin_lock(&n->list_lock);
+		free_block(cachep, &objp, 1, page_node, &list);
+		spin_unlock(&n->list_lock);
+		slabs_destroy(cachep, &list);
 	}
 	return 1;
 }
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+	int page_node = page_to_nid(virt_to_page(objp));
+	int node = numa_mem_id();
+	/*
+	 * Make sure we are not freeing an object from another node to the
+	 * array cache on this cpu.
+	 */
+	if (likely(node == page_node))
+		return 0;
+
+	return __cache_free_alien(cachep, objp, node, page_node);
+}
 #endif
 
 /*
@@ -1132,7 +1038,7 @@ static int init_cache_node_node(int node)
 {
 	struct kmem_cache *cachep;
 	struct kmem_cache_node *n;
-	const int memsize = sizeof(struct kmem_cache_node);
+	const size_t memsize = sizeof(struct kmem_cache_node);
 
 	list_for_each_entry(cachep, &slab_caches, list) {
 		/*
@@ -1140,7 +1046,8 @@ static int init_cache_node_node(int node)
 		 * begin anything. Make sure some other cpu on this
 		 * node has not already allocated this
 		 */
-		if (!cachep->node[node]) {
+		n = get_node(cachep, node);
+		if (!n) {
 			n = kmalloc_node(memsize, GFP_KERNEL, node);
 			if (!n)
 				return -ENOMEM;
@@ -1156,11 +1063,11 @@ static int init_cache_node_node(int node)
 			cachep->node[node] = n;
 		}
 
-		spin_lock_irq(&cachep->node[node]->list_lock);
-		cachep->node[node]->free_limit =
+		spin_lock_irq(&n->list_lock);
+		n->free_limit =
 			(1 + nr_cpus_node(node)) *
 			cachep->batchcount + cachep->num;
-		spin_unlock_irq(&cachep->node[node]->list_lock);
+		spin_unlock_irq(&n->list_lock);
 	}
 	return 0;
 }
@@ -1181,32 +1088,34 @@ static void cpuup_canceled(long cpu)
 	list_for_each_entry(cachep, &slab_caches, list) {
 		struct array_cache *nc;
 		struct array_cache *shared;
-		struct array_cache **alien;
-
-		/* cpu is dead; no one can alloc from it. */
-		nc = cachep->array[cpu];
-		cachep->array[cpu] = NULL;
-		n = cachep->node[node];
+		struct alien_cache **alien;
+		LIST_HEAD(list);
 
+		n = get_node(cachep, node);
 		if (!n)
-			goto free_array_cache;
+			continue;
 
 		spin_lock_irq(&n->list_lock);
 
 		/* Free limit for this kmem_cache_node */
 		n->free_limit -= cachep->batchcount;
-		if (nc)
-			free_block(cachep, nc->entry, nc->avail, node);
+
+		/* cpu is dead; no one can alloc from it. */
+		nc = per_cpu_ptr(cachep->cpu_cache, cpu);
+		if (nc) {
+			free_block(cachep, nc->entry, nc->avail, node, &list);
+			nc->avail = 0;
+		}
 
 		if (!cpumask_empty(mask)) {
 			spin_unlock_irq(&n->list_lock);
-			goto free_array_cache;
+			goto free_slab;
 		}
 
 		shared = n->shared;
 		if (shared) {
 			free_block(cachep, shared->entry,
-				   shared->avail, node);
+				   shared->avail, node, &list);
 			n->shared = NULL;
 		}
 
@@ -1220,8 +1129,9 @@ static void cpuup_canceled(long cpu)
 			drain_alien_cache(cachep, alien);
 			free_alien_cache(alien);
 		}
-free_array_cache:
-		kfree(nc);
+
+free_slab:
+		slabs_destroy(cachep, &list);
 	}
 	/*
 	 * In the previous loop, all the objects were freed to
@@ -1229,7 +1139,7 @@ free_array_cache:
 	 * shrink each nodelist to its limit.
 	 */
 	list_for_each_entry(cachep, &slab_caches, list) {
-		n = cachep->node[node];
+		n = get_node(cachep, node);
 		if (!n)
 			continue;
 		drain_freelist(cachep, n, slabs_tofree(cachep, n));
@@ -1258,33 +1168,24 @@ static int cpuup_prepare(long cpu)
 	 * array caches
 	 */
 	list_for_each_entry(cachep, &slab_caches, list) {
-		struct array_cache *nc;
 		struct array_cache *shared = NULL;
-		struct array_cache **alien = NULL;
+		struct alien_cache **alien = NULL;
 
-		nc = alloc_arraycache(node, cachep->limit,
-					cachep->batchcount, GFP_KERNEL);
-		if (!nc)
-			goto bad;
 		if (cachep->shared) {
 			shared = alloc_arraycache(node,
 				cachep->shared * cachep->batchcount,
 				0xbaadf00d, GFP_KERNEL);
-			if (!shared) {
-				kfree(nc);
+			if (!shared)
 				goto bad;
-			}
 		}
 		if (use_alien_caches) {
 			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
 			if (!alien) {
 				kfree(shared);
-				kfree(nc);
 				goto bad;
 			}
 		}
-		cachep->array[cpu] = nc;
-		n = cachep->node[node];
+		n = get_node(cachep, node);
 		BUG_ON(!n);
 
 		spin_lock_irq(&n->list_lock);
@@ -1305,13 +1206,7 @@ static int cpuup_prepare(long cpu)
 		spin_unlock_irq(&n->list_lock);
 		kfree(shared);
 		free_alien_cache(alien);
-		if (cachep->flags & SLAB_DEBUG_OBJECTS)
-			slab_set_debugobj_lock_classes_node(cachep, node);
-		else if (!OFF_SLAB(cachep) &&
-			 !(cachep->flags & SLAB_DESTROY_BY_RCU))
-			on_slab_lock_classes_node(cachep, node);
 	}
-	init_node_lock_keys(node);
 
 	return 0;
 bad:
@@ -1395,7 +1290,7 @@ static int __meminit drain_cache_node_node(int node)
 	list_for_each_entry(cachep, &slab_caches, list) {
 		struct kmem_cache_node *n;
 
-		n = cachep->node[node];
+		n = get_node(cachep, node);
 		if (!n)
 			continue;
 
@@ -1481,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
 }
 
 /*
- * The memory after the last cpu cache pointer is used for the
- * the node pointer.
- */
-static void setup_node_pointer(struct kmem_cache *cachep)
-{
-	cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
-}
-
-/*
 * Initialisation. Called after the page allocator has been initialised and
 * before smp_init().
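 *
 * (Editor's summary of the simplified bootstrap, as reworked below:
 *	1) create_boot_cache() sets up kmem_cache itself; its per-cpu array
 *	   caches now come from alloc_kmem_cache_cpus(), so the static
 *	   initarray bootstrap is gone		-> slab_state = PARTIAL
 *	2) create the kmalloc cache that backs struct kmem_cache_node
 *						-> slab_state = PARTIAL_NODE
 *	3) replace the __initdata init_kmem_cache_node structures with
 *	   properly allocated ones via init_list(). )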
 */
@@ -1500,7 +1386,6 @@ void __init kmem_cache_init(void)
 	BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
 					sizeof(struct rcu_head));
 	kmem_cache = &kmem_cache_boot;
-	setup_node_pointer(kmem_cache);
 
 	if (num_possible_nodes() == 1)
 		use_alien_caches = 0;
@@ -1508,8 +1393,6 @@ void __init kmem_cache_init(void)
 	for (i = 0; i < NUM_INIT_LISTS; i++)
 		kmem_cache_node_init(&init_kmem_cache_node[i]);
 
-	set_up_node(kmem_cache, CACHE_CACHE);
-
 	/*
 	 * Fragmentation resistance on low memory - only use bigger
 	 * page orders on machines with more than 32MB of memory if
@@ -1544,57 +1427,22 @@ void __init kmem_cache_init(void)
 	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
 	 */
 	create_boot_cache(kmem_cache, "kmem_cache",
-		offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+		offsetof(struct kmem_cache, node) +
 				  nr_node_ids * sizeof(struct kmem_cache_node *),
 				  SLAB_HWCACHE_ALIGN);
 	list_add(&kmem_cache->list, &slab_caches);
-
-	/* 2+3) create the kmalloc caches */
+	slab_state = PARTIAL;
 
 	/*
-	 * Initialize the caches that provide memory for the array cache and the
-	 * kmem_cache_node structures first. Without this, further allocations will
-	 * bug.
+	 * Initialize the caches that provide memory for the kmem_cache_node
+	 * structures first. Without this, further allocations will bug.
 	 */
-
-	kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
-					kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
-
-	if (INDEX_AC != INDEX_NODE)
-		kmalloc_caches[INDEX_NODE] =
-			create_kmalloc_cache("kmalloc-node",
+	kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
 				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
+	slab_state = PARTIAL_NODE;
 
 	slab_early_init = 0;
 
-	/* 4) Replace the bootstrap head arrays */
-	{
-		struct array_cache *ptr;
-
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
-		memcpy(ptr, cpu_cache_get(kmem_cache),
-		       sizeof(struct arraycache_init));
-		/*
-		 * Do not assume that spinlocks can be initialized via memcpy:
-		 */
-		spin_lock_init(&ptr->lock);
-
-		kmem_cache->array[smp_processor_id()] = ptr;
-
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
-		BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
-		       != &initarray_generic.cache);
-		memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
-		       sizeof(struct arraycache_init));
-		/*
-		 * Do not assume that spinlocks can be initialized via memcpy:
-		 */
-		spin_lock_init(&ptr->lock);
-
-		kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
-	}
 	/* 5) Replace the bootstrap kmem_cache_node */
 	{
 		int nid;
@@ -1602,13 +1450,8 @@ void __init kmem_cache_init(void)
 		for_each_online_node(nid) {
 			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
 
-			init_list(kmalloc_caches[INDEX_AC],
-				  &init_kmem_cache_node[SIZE_AC + nid], nid);
-
-			if (INDEX_AC != INDEX_NODE) {
-				init_list(kmalloc_caches[INDEX_NODE],
+			init_list(kmalloc_caches[INDEX_NODE],
 					  &init_kmem_cache_node[SIZE_NODE + nid], nid);
-			}
 		}
 	}
 
@@ -1628,9 +1471,6 @@ void __init kmem_cache_init_late(void)
 			BUG();
 	mutex_unlock(&slab_mutex);
 
-	/* Annotate slab for lockdep -- annotate the malloc caches */
-	init_lock_keys();
-
 	/* Done! */
 	slab_state = FULL;
 
@@ -1690,14 +1530,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
 	printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n",
 		cachep->name, cachep->size, cachep->gfporder);
 
-	for_each_online_node(node) {
+	for_each_kmem_cache_node(cachep, node, n) {
 		unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
 		unsigned long active_slabs = 0, num_slabs = 0;
 
-		n = cachep->node[node];
-		if (!n)
-			continue;
-
 		spin_lock_irqsave(&n->list_lock, flags);
 		list_for_each_entry(page, &n->slabs_full, lru) {
 			active_objs += cachep->num;
@@ -1724,7 +1560,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
 }
 
 /*
- * Interface to system's page allocator. No need to hold the cache-lock.
+ * Interface to system's page allocator. No need to hold the
+ * kmem_cache_node ->list_lock.
 *
  * If we requested dmaable memory, we will get it. Even if we
  * did not request dmaable memory, we might get it, but that
@@ -2026,9 +1863,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
  * @cachep: cache pointer being destroyed
 * @page: page pointer being destroyed
 *
- * Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling the slab must have been unlinked from the cache. The
- * cache-lock is not held/needed.
+ * Destroy all the objs in a slab page, and release the mem back to the system.
+ * Before calling the slab page must have been unlinked from the cache. The
+ * kmem_cache_node ->list_lock is not held/needed.
 */
 static void slab_destroy(struct kmem_cache *cachep, struct page *page)
 {
@@ -2060,6 +1897,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
 		kmem_cache_free(cachep->freelist_cache, freelist);
 }
 
+static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
+{
+	struct page *page, *n;
+
+	list_for_each_entry_safe(page, n, list, lru) {
+		list_del(&page->lru);
+		slab_destroy(cachep, page);
+	}
+}
+
 /**
  * calculate_slab_order - calculate size (page order) of slabs
  * @cachep: pointer to the cache that is being created
@@ -2137,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
+static struct array_cache __percpu *alloc_kmem_cache_cpus(
+		struct kmem_cache *cachep, int entries, int batchcount)
+{
+	int cpu;
+	size_t size;
+	struct array_cache __percpu *cpu_cache;
+
+	size = sizeof(void *) * entries + sizeof(struct array_cache);
+	cpu_cache = __alloc_percpu(size, sizeof(void *));
+
+	if (!cpu_cache)
+		return NULL;
+
+	for_each_possible_cpu(cpu) {
+		init_arraycache(per_cpu_ptr(cpu_cache, cpu),
+				entries, batchcount);
+	}
+
+	return cpu_cache;
+}
+
 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	if (slab_state >= FULL)
 		return enable_cpucache(cachep, gfp);
 
+	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
+	if (!cachep->cpu_cache)
+		return 1;
+
 	if (slab_state == DOWN) {
-		/*
-		 * Note: Creation of first cache (kmem_cache).
-		 * The setup_node is taken care
-		 * of by the caller of __kmem_cache_create
-		 */
-		cachep->array[smp_processor_id()] = &initarray_generic.cache;
-		slab_state = PARTIAL;
+		/* Creation of first cache (kmem_cache). */
+		set_up_node(kmem_cache, CACHE_CACHE);
 	} else if (slab_state == PARTIAL) {
-		/*
-		 * Note: the second kmem_cache_create must create the cache
-		 * that's used by kmalloc(24), otherwise the creation of
-		 * further caches will BUG().
-		 */
-		cachep->array[smp_processor_id()] = &initarray_generic.cache;
-
-		/*
-		 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
-		 * the second cache, then we need to set up all its node/,
-		 * otherwise the creation of further caches will BUG().
-		 */
-		set_up_node(cachep, SIZE_AC);
-		if (INDEX_AC == INDEX_NODE)
-			slab_state = PARTIAL_NODE;
-		else
-			slab_state = PARTIAL_ARRAYCACHE;
+		/* For kmem_cache_node */
+		set_up_node(cachep, SIZE_NODE);
 	} else {
-		/* Remaining boot caches */
-		cachep->array[smp_processor_id()] =
-			kmalloc(sizeof(struct arraycache_init), gfp);
+		int node;
 
-		if (slab_state == PARTIAL_ARRAYCACHE) {
-			set_up_node(cachep, SIZE_NODE);
-			slab_state = PARTIAL_NODE;
-		} else {
-			int node;
-			for_each_online_node(node) {
-				cachep->node[node] =
-				    kmalloc_node(sizeof(struct kmem_cache_node),
-						gfp, node);
-				BUG_ON(!cachep->node[node]);
-				kmem_cache_node_init(cachep->node[node]);
-			}
+		for_each_online_node(node) {
+			cachep->node[node] = kmalloc_node(
+				sizeof(struct kmem_cache_node), gfp, node);
+			BUG_ON(!cachep->node[node]);
+			kmem_cache_node_init(cachep->node[node]);
 		}
 	}
+
 	cachep->node[numa_mem_id()]->next_reap =
 			jiffies + REAPTIMEOUT_NODE +
 			((unsigned long)cachep) % REAPTIMEOUT_NODE;
@@ -2200,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 	return 0;
 }
 
+unsigned long kmem_cache_flags(unsigned long object_size,
+	unsigned long flags, const char *name,
+	void (*ctor)(void *))
+{
+	return flags;
+}
+
+struct kmem_cache *
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+		   unsigned long flags, void (*ctor)(void *))
+{
+	struct kmem_cache *cachep;
+
+	cachep = find_mergeable(size, align, flags, name, ctor);
+	if (cachep) {
+		cachep->refcount++;
+
+		/*
+		 * Adjust the object sizes so that we clear
+		 * the complete object on kzalloc.
+		 */
+		cachep->object_size = max_t(int, cachep->object_size, size);
+	}
+	return cachep;
+}
+
 /**
  * __kmem_cache_create - Create a cache.
  * @cachep: cache management descriptor
@@ -2224,7 +2094,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 int
 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 {
-	size_t left_over, freelist_size, ralign;
+	size_t left_over, freelist_size;
+	size_t ralign = BYTES_PER_WORD;
 	gfp_t gfp;
 	int err;
 	size_t size = cachep->size;
@@ -2257,14 +2128,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 		size &= ~(BYTES_PER_WORD - 1);
 	}
 
-	/*
-	 * Redzoning and user store require word alignment or possibly larger.
-	 * Note this will be overridden by architecture or caller mandated
-	 * alignment if either is greater than BYTES_PER_WORD.
-	 */
-	if (flags & SLAB_STORE_USER)
-		ralign = BYTES_PER_WORD;
-
 	if (flags & SLAB_RED_ZONE) {
 		ralign = REDZONE_ALIGN;
 		/* If redzoning, ensure that the second redzone is suitably
@@ -2290,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	else
 		gfp = GFP_NOWAIT;
 
-	setup_node_pointer(cachep);
 #if DEBUG
 
 	/*
@@ -2405,17 +2267,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 		return err;
 	}
 
-	if (flags & SLAB_DEBUG_OBJECTS) {
-		/*
-		 * Would deadlock through slab_destroy()->call_rcu()->
-		 * debug_object_activate()->kmem_cache_alloc().
-		 */
-		WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
-
-		slab_set_debugobj_lock_classes(cachep);
-	} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
-		on_slab_lock_classes(cachep);
-
 	return 0;
 }
 
@@ -2434,7 +2285,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock);
+	assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
 #endif
 }
 
@@ -2442,7 +2293,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&cachep->node[node]->list_lock);
+	assert_spin_locked(&get_node(cachep, node)->list_lock);
 #endif
 }
 
@@ -2462,12 +2313,16 @@ static void do_drain(void *arg)
 	struct kmem_cache *cachep = arg;
 	struct array_cache *ac;
 	int node = numa_mem_id();
+	struct kmem_cache_node *n;
+	LIST_HEAD(list);
 
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
-	spin_lock(&cachep->node[node]->list_lock);
-	free_block(cachep, ac->entry, ac->avail, node);
-	spin_unlock(&cachep->node[node]->list_lock);
+	n = get_node(cachep, node);
+	spin_lock(&n->list_lock);
+	free_block(cachep, ac->entry, ac->avail, node, &list);
+	spin_unlock(&n->list_lock);
+	slabs_destroy(cachep, &list);
 	ac->avail = 0;
 }
 
@@ -2478,17 +2333,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 	on_each_cpu(do_drain, cachep, 1);
 	check_irq_on();
-	for_each_online_node(node) {
-		n = cachep->node[node];
-		if (n && n->alien)
+	for_each_kmem_cache_node(cachep, node, n)
+		if (n->alien)
 			drain_alien_cache(cachep, n->alien);
-	}
 
-	for_each_online_node(node) {
-		n = cachep->node[node];
-		if (n)
-			drain_array(cachep, n, n->shared, 1, node);
-	}
+	for_each_kmem_cache_node(cachep, node, n)
+		drain_array(cachep, n, n->shared, 1, node);
 }
 
 /*
@@ -2534,17 +2384,14 @@ out:
 int __kmem_cache_shrink(struct kmem_cache *cachep)
 {
-	int ret = 0, i = 0;
+	int ret = 0;
+	int node;
 	struct kmem_cache_node *n;
 
 	drain_cpu_caches(cachep);
 
 	check_irq_on();
-	for_each_online_node(i) {
-		n = cachep->node[i];
-		if (!n)
-			continue;
-
+	for_each_kmem_cache_node(cachep, node, n) {
 		drain_freelist(cachep, n, slabs_tofree(cachep, n));
 
 		ret += !list_empty(&n->slabs_full) ||
@@ -2562,17 +2409,14 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
 	if (rc)
 		return rc;
 
-	for_each_online_cpu(i)
-	    kfree(cachep->array[i]);
+	free_percpu(cachep->cpu_cache);
 
 	/* NUMA: free the node structures */
-	for_each_online_node(i) {
-		n = cachep->node[i];
-		if (n) {
-			kfree(n->shared);
-			free_alien_cache(n->alien);
-			kfree(n);
-		}
+	for_each_kmem_cache_node(cachep, i, n) {
+		kfree(n->shared);
+		free_alien_cache(n->alien);
+		kfree(n);
+		cachep->node[i] = NULL;
 	}
 	return 0;
 }
@@ -2751,7 +2595,7 @@ static int cache_grow(struct kmem_cache *cachep,
 
 	/* Take the node list lock to change the colour_next on this node */
 	check_irq_off();
-	n = cachep->node[nodeid];
+	n = get_node(cachep, nodeid);
 	spin_lock(&n->list_lock);
 
 	/* Get colour for the slab, and calc the next value. */
@@ -2920,7 +2764,7 @@ retry:
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
-	n = cachep->node[node];
+	n = get_node(cachep, node);
 
 	BUG_ON(ac->avail > 0 || !n);
 	spin_lock(&n->list_lock);
@@ -3060,7 +2904,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 
 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (cachep == kmem_cache)
+	if (unlikely(cachep == kmem_cache))
 		return false;
 
 	return should_failslab(cachep->object_size, flags, cachep->flags);
@@ -3111,7 +2955,7 @@ out:
 
 #ifdef CONFIG_NUMA
 /*
- * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set.
+ * Try allocating on another node if PFA_SPREAD_SLAB or a mempolicy is set.
 *
  * If we are in_interrupt, then process context, including cpusets and
  * mempolicy, may not apply and should not be used for allocation policy.
@@ -3169,8 +3013,8 @@ retry:
 		nid = zone_to_nid(zone);
 
 		if (cpuset_zone_allowed_hardwall(zone, flags) &&
-			cache->node[nid] &&
-			cache->node[nid]->free_objects) {
+			get_node(cache, nid) &&
+			get_node(cache, nid)->free_objects) {
 				obj = ____cache_alloc_node(cache,
 					flags | GFP_THISNODE, nid);
 				if (obj)
@@ -3233,7 +3077,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 	int x;
 
 	VM_BUG_ON(nodeid > num_online_nodes());
-	n = cachep->node[nodeid];
+	n = get_node(cachep, nodeid);
 	BUG_ON(!n);
 
 retry:
@@ -3304,7 +3148,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	if (nodeid == NUMA_NO_NODE)
 		nodeid = slab_node;
 
-	if (unlikely(!cachep->node[nodeid])) {
+	if (unlikely(!get_node(cachep, nodeid))) {
 		/* Node not bootstrapped yet */
 		ptr = fallback_alloc(cachep, flags);
 		goto out;
@@ -3343,7 +3187,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
 {
 	void *objp;
 
-	if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
+	if (current->mempolicy || cpuset_do_slab_mem_spread()) {
 		objp = alternate_node_alloc(cache, flags);
 		if (objp)
 			goto out;
@@ -3405,12 +3249,13 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 
 /*
  * Caller needs to acquire correct kmem_cache_node's list_lock
+ * @list: List of detached free slabs should be freed by caller
 */
-static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
-		       int node)
+static void free_block(struct kmem_cache *cachep, void **objpp,
+			int nr_objects, int node, struct list_head *list)
 {
 	int i;
-	struct kmem_cache_node *n;
+	struct kmem_cache_node *n = get_node(cachep, node);
 
 	for (i = 0; i < nr_objects; i++) {
 		void *objp;
@@ -3420,7 +3265,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
 		objp = objpp[i];
 
 		page = virt_to_head_page(objp);
-		n = cachep->node[node];
 		list_del(&page->lru);
 		check_spinlock_acquired_node(cachep, node);
 		slab_put_obj(cachep, page, objp, node);
@@ -3431,13 +3275,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
 		if (page->active == 0) {
 			if (n->free_objects > n->free_limit) {
 				n->free_objects -= cachep->num;
-				/* No need to drop any previously held
-				 * lock here, even if we have a off-slab slab
-				 * descriptor it is guaranteed to come from
-				 * a different cache, refer to comments before
-				 * alloc_slabmgmt.
-				 */
-				slab_destroy(cachep, page);
+				list_add_tail(&page->lru, list);
 			} else {
 				list_add(&page->lru, &n->slabs_free);
 			}
@@ -3456,13 +3294,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 	int batchcount;
 	struct kmem_cache_node *n;
 	int node = numa_mem_id();
+	LIST_HEAD(list);
 
 	batchcount = ac->batchcount;
 #if DEBUG
 	BUG_ON(!batchcount || batchcount > ac->avail);
 #endif
 	check_irq_off();
-	n = cachep->node[node];
+	n = get_node(cachep, node);
 	spin_lock(&n->list_lock);
 	if (n->shared) {
 		struct array_cache *shared_array = n->shared;
@@ -3477,7 +3316,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 		}
 	}
 
-	free_block(cachep, ac->entry, batchcount, node);
+	free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
 #if STATS
 	{
@@ -3498,6 +3337,7 @@ free_done:
 	}
 #endif
 	spin_unlock(&n->list_lock);
+	slabs_destroy(cachep, &list);
 	ac->avail -= batchcount;
 	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
 }
@@ -3527,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
 		return;
 
-	if (likely(ac->avail < ac->limit)) {
+	if (ac->avail < ac->limit) {
 		STATS_INC_FREEHIT(cachep);
 	} else {
 		STATS_INC_FREEMISS(cachep);
@@ -3624,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
 }
 
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
 void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return __do_kmalloc_node(size, flags, node, _RET_IP_);
@@ -3637,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
 	return __do_kmalloc_node(size, flags, node, caller);
 }
 EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#else
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return __do_kmalloc_node(size, flags, node, 0);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
 #endif /* CONFIG_NUMA */
 
 /**
@@ -3669,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 
 	return ret;
 }
-
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
 void *__kmalloc(size_t size, gfp_t flags)
 {
 	return __do_kmalloc(size, flags, _RET_IP_);
@@ -3683,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
 
-#else
-void *__kmalloc(size_t size, gfp_t flags)
-{
-	return __do_kmalloc(size, flags, 0);
-}
-EXPORT_SYMBOL(__kmalloc);
-#endif
-
 /**
  * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
@@ -3754,7 +3576,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) int node; struct kmem_cache_node *n; struct array_cache *new_shared; - struct array_cache **new_alien = NULL; + struct alien_cache **new_alien = NULL; for_each_online_node(node) { @@ -3775,15 +3597,16 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) } } - n = cachep->node[node]; + n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; + LIST_HEAD(list); spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = new_shared; if (!n->alien) { @@ -3793,6 +3616,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(shared); free_alien_cache(new_alien); continue; @@ -3820,9 +3644,8 @@ fail: /* Cache is not active yet. Roll back what we did */ node--; while (node >= 0) { - if (cachep->node[node]) { - n = cachep->node[node]; - + n = get_node(cachep, node); + if (n) { kfree(n->shared); free_alien_cache(n->alien); kfree(n); @@ -3834,64 +3657,45 @@ fail: return -ENOMEM; } -struct ccupdate_struct { - struct kmem_cache *cachep; - struct array_cache *new[0]; -}; - -static void do_ccupdate_local(void *info) -{ - struct ccupdate_struct *new = info; - struct array_cache *old; - - check_irq_off(); - old = cpu_cache_get(new->cachep); - - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; -} - /* Always called with the slab_mutex held */ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { - struct ccupdate_struct *new; - int i; + struct array_cache __percpu *cpu_cache, *prev; + int cpu; - new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), - gfp); - if (!new) + cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); + if (!cpu_cache) return -ENOMEM; - for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, - batchcount, gfp); - if (!new->new[i]) { - for (i--; i >= 0; i--) - kfree(new->new[i]); - kfree(new); - return -ENOMEM; - } - } - new->cachep = cachep; - - on_each_cpu(do_ccupdate_local, (void *)new, 1); + prev = cachep->cpu_cache; + cachep->cpu_cache = cpu_cache; + kick_all_cpus_sync(); check_irq_on(); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - for_each_online_cpu(i) { - struct array_cache *ccold = new->new[i]; - if (!ccold) - continue; - spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); - kfree(ccold); + if (!prev) + goto alloc_node; + + for_each_online_cpu(cpu) { + LIST_HEAD(list); + int node; + struct kmem_cache_node *n; + struct array_cache *ac = per_cpu_ptr(prev, cpu); + + node = cpu_to_mem(cpu); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); } - kfree(new); + free_percpu(prev); + +alloc_node: return alloc_kmem_cache_node(cachep, gfp); } @@ -3996,6 +3800,7 @@ skip_setup: static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { + LIST_HEAD(list); int tofree; if (!ac || !ac->avail) @@ 
-4008,12 +3813,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &list); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); } } @@ -4048,7 +3854,7 @@ static void cache_reap(struct work_struct *w) * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - n = searchp->node[node]; + n = get_node(searchp, node); reap_alien(searchp, n); @@ -4100,10 +3906,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs = 0; num_slabs = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -4328,10 +4131,7 @@ static int leaks_show(struct seq_file *m, void *p) x[1] = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -4378,19 +4178,15 @@ static const struct seq_operations slabstats_op = { static int slabstats_open(struct inode *inode, struct file *file) { - unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); - int ret = -ENOMEM; - if (n) { - ret = seq_open(file, &slabstats_op); - if (!ret) { - struct seq_file *m = file->private_data; - *n = PAGE_SIZE / (2 * sizeof(unsigned long)); - m->private = n; - n = NULL; - } - kfree(n); - } - return ret; + unsigned long *n; + + n = __seq_open_private(file, &slabstats_op, PAGE_SIZE); + if (!n) + return -ENOMEM; + + *n = PAGE_SIZE / (2 * sizeof(unsigned long)); + + return 0; } static const struct file_operations proc_slabstats_operations = { @@ -4,6 +4,41 @@ * Internal slab definitions */ +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. + */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + unsigned long flags; /* Active flags on the slab */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include <linux/slab_def.h> +#endif + +#ifdef CONFIG_SLUB +#include <linux/slub_def.h> +#endif + +#include <linux/memcontrol.h> + /* * State of the slab allocator. 
* @@ -15,7 +50,6 @@ enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ - PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ @@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); struct mem_cgroup; -#ifdef CONFIG_SLUB + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); + +unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } + +static inline unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} #endif @@ -256,13 +305,12 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __FUNCTION__, cachep->name, s->name); + __func__, cachep->name, s->name); WARN_ON_ONCE(1); return s; } -#endif - +#ifndef CONFIG_SLOB /* * The slab lists for all objects. */ @@ -277,7 +325,7 @@ struct kmem_cache_node { unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct alien_cache **alien; /* on other nodes */ unsigned long next_reap; /* updated without locking */ int free_touched; /* updated without locking */ #endif @@ -294,5 +342,22 @@ struct kmem_cache_node { }; +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +#endif + void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); + +#endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index d31c4ba..dcdab81 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -19,6 +19,8 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> + +#define CREATE_TRACE_POINTS #include <trace/events/kmem.h> #include "slab.h" @@ -28,6 +30,43 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; +/* + * Set of flags that will prevent slab merging + */ +#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) + +#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA | SLAB_NOTRACK) + +/* + * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) 
+ */ +static int slab_nomerge; + +static int __init setup_slab_nomerge(char *str) +{ + slab_nomerge = 1; + return 1; +} + +#ifdef CONFIG_SLUB +__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); +#endif + +__setup("slab_nomerge", setup_slab_nomerge); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->object_size; +} +EXPORT_SYMBOL(kmem_cache_size); + #ifdef CONFIG_DEBUG_VM static int kmem_cache_sanity_check(const char *name, size_t size) { @@ -54,16 +93,6 @@ static int kmem_cache_sanity_check(const char *name, size_t size) s->object_size); continue; } - -#if !defined(CONFIG_SLUB) - if (!strcmp(s->name, name)) { - pr_err("%s (%s): Cache name already exists.\n", - __func__, name); - dump_stack(); - s = NULL; - return -EINVAL; - } -#endif } WARN_ON(strchr(name, ' ')); /* It confuses parsers */ @@ -77,6 +106,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) #endif #ifdef CONFIG_MEMCG_KMEM +static int memcg_alloc_cache_params(struct mem_cgroup *memcg, + struct kmem_cache *s, struct kmem_cache *root_cache) +{ + size_t size; + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) { + size = offsetof(struct memcg_cache_params, memcg_caches); + size += memcg_limited_groups_array_size * sizeof(void *); + } else + size = sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } else + s->memcg_params->is_root_cache = true; + + return 0; +} + +static void memcg_free_cache_params(struct kmem_cache *s) +{ + kfree(s->memcg_params); +} + +static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) +{ + int size; + struct memcg_cache_params *new_params, *cur_params; + + BUG_ON(!is_root_cache(s)); + + size = offsetof(struct memcg_cache_params, memcg_caches); + size += num_memcgs * sizeof(void *); + + new_params = kzalloc(size, GFP_KERNEL); + if (!new_params) + return -ENOMEM; + + cur_params = s->memcg_params; + memcpy(new_params->memcg_caches, cur_params->memcg_caches, + memcg_limited_groups_array_size * sizeof(void *)); + + new_params->is_root_cache = true; + + rcu_assign_pointer(s->memcg_params, new_params); + if (cur_params) + kfree_rcu(cur_params, rcu_head); + + return 0; +} + int memcg_update_all_caches(int num_memcgs) { struct kmem_cache *s; @@ -87,9 +175,8 @@ int memcg_update_all_caches(int num_memcgs) if (!is_root_cache(s)) continue; - ret = memcg_update_cache_size(s, num_memcgs); + ret = memcg_update_cache_params(s, num_memcgs); /* - * See comment in memcontrol.c, memcg_update_cache_size: * Instead of freeing the memory, we'll just leave the caches * up to this point in an updated state. */ @@ -102,7 +189,84 @@ out: mutex_unlock(&slab_mutex); return ret; } -#endif +#else +static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, + struct kmem_cache *s, struct kmem_cache *root_cache) +{ + return 0; +} + +static inline void memcg_free_cache_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +/* + * Find a mergeable slab cache + */ +int slab_unmergeable(struct kmem_cache *s) +{ + if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE)) + return 1; + + if (!is_root_cache(s)) + return 1; + + if (s->ctor) + return 1; + + /* + * We may have set a slab to be unmergeable during bootstrap. 
+ */ + if (s->refcount < 0) + return 1; + + return 0; +} + +struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) +{ + struct kmem_cache *s; + + if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) + return NULL; + + if (ctor) + return NULL; + + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name, NULL); + + list_for_each_entry(s, &slab_caches, list) { + if (slab_unmergeable(s)) + continue; + + if (size > s->size) + continue; + + if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. + * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align - 1)) != s->size) + continue; + + if (s->size - size >= sizeof(void *)) + continue; + + if (IS_ENABLED(CONFIG_SLAB) && align && + (align > s->align || s->align % align)) + continue; + + return s; + } + return NULL; +} /* * Figure out what the alignment of the objects will be given a set of @@ -209,8 +373,10 @@ kmem_cache_create(const char *name, size_t size, size_t align, mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); - if (err) + if (err) { + s = NULL; /* suppress uninit var warning */ goto out_unlock; + } /* * Some allocators will constraint the set of valid flags to a subset @@ -787,3 +953,102 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ + +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + +/** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. + */ +void *__krealloc(const void *p, size_t new_size, gfp_t flags) +{ + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + return __do_krealloc(p, new_size, flags); + +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). 
So be + * careful when using this function in performance sensitive code. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + +/* Tracepoints definitions. */ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); @@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); -#ifdef CONFIG_TRACING void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) { return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); @@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, return __do_kmalloc_node(size, gfp, node, caller); } #endif -#endif void kfree(const void *block) { @@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) */ #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) -/* - * Set of flags that will prevent slab merging - */ -#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) - -#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) - #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ @@ -233,11 +223,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -288,6 +273,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -382,9 +371,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -418,9 +407,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -945,60 +934,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* - * Hooks for other subsystems that check memory allocations. 
In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); -} - -/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache *s, @@ -1231,7 +1166,7 @@ out: __setup("slub_debug", setup_slub_debug); -static unsigned long kmem_cache_flags(unsigned long object_size, +unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1263,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long object_size, +unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1282,6 +1217,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. 
+ */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); @@ -1293,21 +1234,44 @@ static inline void kfree_hook(const void *x) } static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->object_size, flags, s->flags); +} -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -#endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); +} /* * Slab allocation and freeing @@ -1409,9 +1373,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; void *start; - void *last; void *p; int order; + int idx; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1432,14 +1396,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); page->freelist = start; page->inuse = page->objects; @@ -1726,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu *c) { void *object; - int searchnode = (node == NUMA_NO_NODE) ? 
numa_mem_id() : node; + int searchnode = node; + + if (node == NUMA_NO_NODE) + searchnode = numa_mem_id(); + else if (!node_present_pages(node)) + searchnode = node_to_mem_node(node); object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -2162,6 +2130,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; @@ -2176,15 +2145,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -2310,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, redo: if (unlikely(!node_match(page, node))) { - stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; - goto new_slab; + int searchnode = node; + + if (node != NUMA_NO_NODE && !node_present_pages(node)) + searchnode = node_to_mem_node(node); + + if (unlikely(!node_match(page, searchnode))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } } /* @@ -2737,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; static int slub_min_objects; /* - * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) - */ -static int slub_nomerge; - -/* * Calculate the order of allocation given an slab object size. 
* * The order of allocation has significant impact on performance and other @@ -2928,13 +2894,10 @@ static void early_kmem_cache_node_alloc(int node) static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@ -3222,12 +3185,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@ -3274,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -static int __init setup_slub_nomerge(char *str) -{ - slub_nomerge = 1; - return 1; -} - -__setup("slub_nomerge", setup_slub_nomerge); - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; @@ -3412,9 +3366,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { if (!n->nr_partial) continue; @@ -3586,6 +3538,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3595,19 +3548,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@ -3663,69 +3613,6 @@ void __init kmem_cache_init_late(void) { } -/* - * Find a mergeable slab cache - */ -static int slab_unmergeable(struct kmem_cache *s) -{ - if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) - return 1; - - if (!is_root_cache(s)) - return 1; - - if (s->ctor) - return 1; - - /* - * We may have set a slab to be unmergeable during bootstrap. - */ - if (s->refcount < 0) - return 1; - - return 0; -} - -static struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)) -{ - struct kmem_cache *s; - - if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) - return NULL; - - if (ctor) - return NULL; - - size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); - size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name, NULL); - - list_for_each_entry(s, &slab_caches, list) { - if (slab_unmergeable(s)) - continue; - - if (size > s->size) - continue; - - if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; - /* - * Check if alignment is compatible. 
- * Courtesy of Adrian Drzewiecki - */ - if ((s->size & ~(align - 1)) != s->size) - continue; - - if (s->size - size >= sizeof(void *)) - continue; - - return s; - } - return NULL; -} - struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) @@ -3960,16 +3847,14 @@ static long validate_slab_cache(struct kmem_cache *s) unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@ -4123,6 +4008,7 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@ -4132,8 +4018,7 @@ static int list_locations(struct kmem_cache *s, char *buf, /* Push back cpu slabs */ flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; @@ -4205,7 +4090,7 @@ static int list_locations(struct kmem_cache *s, char *buf, #endif #ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) +static void __init resiliency_test(void) { u8 *p; @@ -4332,8 +4217,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = atomic_long_read(&n->total_objects); @@ -4349,9 +4235,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4364,7 +4250,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -4378,16 +4264,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@ -4509,7 +4391,7 @@ SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount - 1); + return sprintf(buf, "%d\n", s->refcount < 0 ? 
0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); @@ -4647,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf) static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { + /* + * Tracing a merged cache is going to give confusing results + * as well as cause other issues like converting a mergeable + * cache into an umergeable one. + */ + if (s->refcount > 1) + return -EINVAL; + s->flags &= ~SLAB_TRACE; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; @@ -4764,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) static ssize_t failslab_store(struct kmem_cache *s, const char *buf, size_t length) { + if (s->refcount > 1) + return -EINVAL; + s->flags &= ~SLAB_FAILSLAB; if (buf[0] == '1') s->flags |= SLAB_FAILSLAB; @@ -5171,12 +5064,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = '-'; p += sprintf(p, "%07d", s->size); -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - p += sprintf(p, "-%08d", - memcg_cache_id(s->memcg_params->memcg)); -#endif - BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5342,13 +5229,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); @@ -62,6 +62,7 @@ static void __page_cache_release(struct page *page) del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } + mem_cgroup_uncharge(page); } static void __put_single_page(struct page *page) @@ -501,7 +502,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_activate(page, page_to_pfn(page)); + trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -589,6 +590,9 @@ static void __lru_cache_activate_page(struct page *page) * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void mark_page_accessed(struct page *page) { @@ -614,17 +618,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -/* - * Used to mark_page_accessed(page) that is not visible yet and when it is - * still safe to use non-atomic ops - */ -void init_page_accessed(struct page *page) -{ - if (!PageReferenced(page)) - __SetPageReferenced(page); -} -EXPORT_SYMBOL(init_page_accessed); - static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); @@ -695,6 +688,40 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * Place @page on the active or unevictable LRU list, depending on its + * evictability. Note that if the page is not evictable, it goes + * directly back onto it's zone's unevictable list, it does NOT use a + * per cpu pagevec. 
+ */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { + SetPageActive(page); + lru_cache_add(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); +} + /* * If the page can not be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the @@ -860,18 +887,14 @@ void lru_add_drain_all(void) mutex_unlock(&lock); } -/* - * Batched page_cache_release(). Decrement the reference count on all the - * passed pages. If it fell to zero then remove the page from the LRU and - * free it. - * - * Avoid taking zone->lru_lock if possible, but if it is taken, retain it - * for the remainder of the operation. +/** + * release_pages - batched page_cache_release() + * @pages: array of pages to release + * @nr: number of pages + * @cold: whether the pages are cache cold * - * The locking in this function is against shrink_inactive_list(): we recheck - * the page count inside the lock to see whether shrink_inactive_list() - * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() - * will free it. + * Decrement the reference count on all the pages in @pages. If it + * fell to zero, remove the page from the LRU and free it. */ void release_pages(struct page **pages, int nr, bool cold) { @@ -880,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold) struct zone *zone = NULL; struct lruvec *lruvec; unsigned long uninitialized_var(flags); + unsigned int uninitialized_var(lock_batch); for (i = 0; i < nr; i++) { struct page *page = pages[i]; @@ -893,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold) continue; } + /* + * Make sure the IRQ-safe lock-holding time does not get + * excessive with a continuous string of pages from the + * same zone. The lock is held only if zone != NULL. 
+ */ + if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + if (!put_page_testzero(page)) continue; @@ -903,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold) if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); + lock_batch = 0; zone = pagezone; spin_lock_irqsave(&zone->lru_lock, flags); } @@ -921,6 +956,7 @@ void release_pages(struct page **pages, int nr, bool cold) if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); + mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); @@ -996,7 +1032,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); - trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); + trace_mm_lru_insertion(page, lru); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 2972eee..1544449 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -28,7 +28,9 @@ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .set_page_dirty = swap_set_page_dirty, +#ifdef CONFIG_MIGRATION .migratepage = migrate_page, +#endif }; static struct backing_dev_info swap_backing_dev_info = { @@ -39,6 +41,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, } @@ -176,7 +179,7 @@ int add_to_swap(struct page *page, struct list_head *list) if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry, NULL); + swapcache_free(entry); return 0; } @@ -202,7 +205,7 @@ int add_to_swap(struct page *page, struct list_head *list) * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); return 0; } } @@ -225,7 +228,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry, page); + swapcache_free(entry); page_cache_release(page); } @@ -262,18 +265,12 @@ void free_page_and_swap_cache(struct page *page) void free_pages_and_swap_cache(struct page **pages, int nr) { struct page **pagep = pages; + int i; lru_add_drain(); - while (nr) { - int todo = min(nr, PAGEVEC_SIZE); - int i; - - for (i = 0; i < todo; i++) - free_swap_cache(pagep[i]); - release_pages(pagep, todo, false); - pagep += todo; - nr -= todo; - } + for (i = 0; i < nr; i++) + free_swap_cache(pagep[i]); + release_pages(pagep, nr, false); } /* @@ -386,7 +383,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4c524f7..8798b2e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -843,16 +843,13 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. 
*/ -void swapcache_free(swp_entry_t entry, struct page *page) +void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; - unsigned char count; p = swap_info_get(entry); if (p) { - count = swap_entry_free(p, entry, SWAP_HAS_CACHE); - if (page) - mem_cgroup_uncharge_swapcache(page, entry, count != 0); + swap_entry_free(p, entry, SWAP_HAS_CACHE); spin_unlock(&p->lock); } } @@ -1106,15 +1103,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, - GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge_swapin(memcg); + mem_cgroup_cancel_charge(page, memcg); ret = 0; goto out; } @@ -1124,11 +1120,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); - if (page == swapcache) + if (page == swapcache) { page_add_anon_rmap(page, vma, addr); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, memcg); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); /* * Move the page to the active list so it is not diff --git a/mm/truncate.c b/mm/truncate.c index eda2473..f1e4d60 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ #include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -281,7 +282,6 @@ void truncate_inode_pages_range(struct address_space *mapping, while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -307,7 +307,6 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -369,7 +368,6 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); break; } - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -394,7 +392,6 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); index++; } cleancache_invalidate_inode(mapping); @@ -493,7 +490,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -522,7 +518,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -553,7 +548,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) 
BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -602,7 +596,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -655,7 +648,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -723,17 +715,73 @@ EXPORT_SYMBOL(truncate_pagecache); * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and before all filesystem specific - * block truncation has been performed. + * Must be called with a lock serializing truncates and writes (generally + * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? */ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. 
+ */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + +/** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole @@ -16,9 +16,6 @@ #include "internal.h" -#define CREATE_TRACE_POINTS -#include <trace/events/kmem.h> - /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -112,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) -{ - void *ret; - size_t ks = 0; - - if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; -} - -/** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. - */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) -{ - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; - - return __do_krealloc(p, new_size, flags); - -} -EXPORT_SYMBOL(__krealloc); - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** - * kzfree - like kfree but zero memory - * @p: object to free memory of - * - * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. - * - * Note: this function zeroes the whole allocated buffer which can be a good - * deal bigger than the requested buffer size passed to kmalloc(). So be - * careful when using this function in performance sensitive code. - */ -void kzfree(const void *p) -{ - size_t ks; - void *mem = (void *)p; - - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; - ks = ksize(mem); - memset(mem, 0, ks); - kfree(mem); -} -EXPORT_SYMBOL(kzfree); - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -264,35 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t, /* * Check if the vma is being used as a stack. * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the pid of the task that - * the vma is stack for. + * just check in the current task. Returns the task_struct of the task + * that the vma is stack for. Must be called under rcu_read_lock(). 
*/ -pid_t vm_is_stack(struct task_struct *task, - struct vm_area_struct *vma, int in_group) +struct task_struct *task_of_stack(struct task_struct *task, + struct vm_area_struct *vma, bool in_group) { - pid_t ret = 0; - if (vm_is_stack_for_task(task, vma)) - return task->pid; + return task; if (in_group) { struct task_struct *t; - rcu_read_lock(); - if (!pid_alive(task)) - goto done; - - t = task; - do { - if (vm_is_stack_for_task(t, vma)) { - ret = t->pid; - goto done; - } - } while_each_thread(task, t); -done: - rcu_read_unlock(); + + for_each_thread(task, t) { + if (vm_is_stack_for_task(t, vma)) + return t; + } } - return ret; + return NULL; } #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) @@ -504,11 +400,3 @@ out_mm: out: return res; } - -/* Tracepoints definitions. */ -EXPORT_TRACEPOINT_SYMBOL(kmalloc); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); -EXPORT_TRACEPOINT_SYMBOL(kfree); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f64632b..90520af 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) } EXPORT_SYMBOL_GPL(unmap_kernel_range); -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; unsigned long end = addr + get_vm_area_size(area); int err; - err = vmap_page_range(addr, end, prot, *pages); - if (err > 0) { - *pages += err; - err = 0; - } + err = vmap_page_range(addr, end, prot, pages); - return err; + return err > 0 ? 0 : err; } EXPORT_SYMBOL_GPL(map_vm_area); @@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, &pages)) { + if (map_vm_area(area, prot, pages)) { vunmap(area->addr); return NULL; } @@ -1566,7 +1562,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; - gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1589,12 +1586,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; - gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; if (node == NUMA_NO_NODE) - page = alloc_page(tmp_mask); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, tmp_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1602,9 +1598,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; + if (gfp_mask & __GFP_WAIT) + cond_resched(); } - if (map_vm_area(area, prot, &pages)) + if (map_vm_area(area, prot, pages)) goto fail; return area->addr; @@ -2648,21 +2646,11 @@ static const struct seq_operations vmalloc_op = { static int vmalloc_open(struct inode *inode, struct file *file) { - unsigned int *ptr = NULL; - int ret; - - if (IS_ENABLED(CONFIG_NUMA)) { - ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); - if (ptr == NULL) - return -ENOMEM; - } - ret = 
seq_open(file, &vmalloc_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ptr; - } else - kfree(ptr); - return ret; + if (IS_ENABLED(CONFIG_NUMA)) + return seq_open_private(file, &vmalloc_op, + nr_node_ids * sizeof(unsigned int)); + else + return seq_open(file, &vmalloc_op); } static const struct file_operations proc_vmalloc_operations = { @@ -2690,14 +2678,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) prev_end = VMALLOC_START; - spin_lock(&vmap_area_lock); + rcu_read_lock(); if (list_empty(&vmap_area_list)) { vmi->largest_chunk = VMALLOC_TOTAL; goto out; } - list_for_each_entry(va, &vmap_area_list, list) { + list_for_each_entry_rcu(va, &vmap_area_list, list) { unsigned long addr = va->va_start; /* @@ -2724,7 +2712,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) vmi->largest_chunk = VMALLOC_END - prev_end; out: - spin_unlock(&vmap_area_lock); + rcu_read_unlock(); } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 0f16ffe..dcb4707 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,35 +59,20 @@ #include <trace/events/vmscan.h> struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; - - /* anon vs. file LRUs scanning "ratio" */ - int swappiness; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -95,11 +80,27 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -136,7 +137,11 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. 
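To make the priority field above concrete, a worked example (illustrative numbers; DEF_PRIORITY is 12 in this kernel):

        /* An LRU of 1048576 pages gets 1048576 >> 12 = 256 pages
         * scanned on the first reclaim pass, with the scan window
         * doubling each time sc->priority drops by one. */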
+ */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -169,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -571,9 +577,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - swapcache_free(swap, page); + swapcache_free(swap); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -594,7 +601,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (freepage != NULL) freepage(page); @@ -816,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); - mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -915,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - zone_is_reclaim_writeback(zone)) { + test_bit(ZONE_WRITEBACK, &zone->flags)) { nr_immediate++; goto keep_locked; @@ -997,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - !zone_is_reclaim_dirty(zone))) { + !test_bit(ZONE_DIRTY, &zone->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -1127,11 +1132,12 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + mem_cgroup_uncharge_list(&free_pages); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; *ret_nr_congested += nr_congested; *ret_nr_unqueued_dirty += nr_unqueued_dirty; @@ -1431,6 +1437,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1503,7 +1510,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1538,6 +1545,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); /* @@ -1555,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * are encountered in the nr_immediate check below. 
*/ if (nr_writeback && nr_writeback == nr_taken) - zone_set_flag(zone, ZONE_WRITEBACK); + set_bit(ZONE_WRITEBACK, &zone->flags); /* * memcg will stall in page writeback so only consider forcibly @@ -1567,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * backed by a congested BDI and wait_iff_congested will stall. */ if (nr_dirty && nr_dirty == nr_congested) - zone_set_flag(zone, ZONE_CONGESTED); + set_bit(ZONE_CONGESTED, &zone->flags); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing - * pages from reclaim context. + * the zone ZONE_DIRTY and kswapd will start writing pages from + * reclaim context. */ if (nr_unqueued_dirty == nr_taken) - zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + set_bit(ZONE_DIRTY, &zone->flags); /* * If kswapd scans pages marked marked for immediate @@ -1652,6 +1660,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1693,7 +1702,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; @@ -1750,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * get_scan_count. */ reclaim_stat->recent_rotated[file] += nr_rotated; @@ -1759,6 +1768,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); } @@ -1865,8 +1875,8 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1909,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !sc->swappiness) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1919,16 +1929,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). 
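The zone_set_flag()/zone_clear_flag()/zone_test_and_set_flag() wrappers used above are replaced throughout by generic bitops on zone->flags; the mapping, for illustration:

        /* Before:  zone_set_flag(zone, ZONE_CONGESTED);
         * After (plain bitops on the zone's flags word): */
        set_bit(ZONE_CONGESTED, &zone->flags);
        if (test_bit(ZONE_CONGESTED, &zone->flags))
                clear_bit(ZONE_CONGESTED, &zone->flags);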
*/ - if (!sc->priority && sc->swappiness) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip @@ -1939,9 +1944,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + unsigned long zonefile; + unsigned long zonefree; - if (unlikely(file + free <= high_wmark_pages(zone))) { + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1962,7 +1972,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -1976,6 +1986,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; @@ -2052,7 +2068,8 @@ out: /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
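A worked example of the swappiness weighting that get_scan_count() now receives as an explicit argument:

        /* With the default vm_swappiness of 60:
         *      anon_prio = swappiness        = 60
         *      file_prio = 200 - anon_prio   = 140
         * i.e. a 7:3 file:anon scan weighting before the
         * recent_scanned/recent_rotated feedback adjusts it. */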
*/ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2063,7 +2080,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2241,9 +2258,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2259,11 +2277,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct lruvec *lruvec; + int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); - sc->swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, sc); + shrink_lruvec(lruvec, swappiness, sc); /* * Direct reclaim and kswapd have to scan all memory @@ -2287,20 +2306,24 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } -/* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +/* + * Returns true if compaction should go ahead for a high-order request, or + * the high-order allocation would succeed without compaction. + */ +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. Continue reclaiming until @@ -2309,18 +2332,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) */ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; - /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + /* + * If compaction is not ready to start and allocation is not likely + * to succeed without it, then keep reclaiming. 
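Plugging numbers into the watermark above: for a THP-sized request (order 9, i.e. 2 MiB with 4 KiB pages), 2UL << 9 adds 1024 pages, so compaction is only reported ready once the zone sits 4 MiB plus the balance gap above its high watermark:

        /* Illustrative, order == 9: */
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << 9);
        /*                                  \- adds 1024 pages (4 MiB) */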
+ */ + if (compaction_suitable(zone, order) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2342,10 +2368,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. + * Returns true if a zone was reclaimable. */ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { @@ -2354,13 +2377,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long lru_pages = 0; - bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2391,22 +2414,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if ((zonelist_zone_idx(z) <= requested_highidx) - && compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2419,10 +2444,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); + if (shrink_zone(zone, sc)) + reclaimable = true; + + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; } /* @@ -2445,27 +2477,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) */ sc->gfp_mask = orig_mask; - return aborted_reclaim; -} - -/* All zones in zonelist are unreclaimable? 
*/ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (zone_reclaimable(zone)) - return false; - } - - return true; + return reclaimable; } /* @@ -2489,7 +2501,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool aborted_reclaim; + bool zones_reclaimable; delayacct_freepages_start(); @@ -2500,11 +2512,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); + zones_reclaimable = shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2526,28 +2541,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0 && !aborted_reclaim); + } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. - */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->compaction_ready) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; @@ -2684,15 +2690,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, - .nodemask = nodemask, }; /* @@ -2722,17 +2727,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .order = 0, - .priority = 0, - .swappiness = mem_cgroup_swappiness(memcg), - .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2748,7 +2750,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
*/ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2757,23 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, gfp_t gfp_mask, - bool noswap) + bool may_swap) { struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, - .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = may_swap, }; /* @@ -2824,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order, return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && - !compaction_suitable(zone, order)) + compaction_suitable(zone, order) == COMPACT_SKIPPED) return false; return true; @@ -2984,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone, /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - zone_clear_flag(zone, ZONE_WRITEBACK); + clear_bit(ZONE_WRITEBACK, &zone->flags); /* * If a zone reaches its high watermark, consider it to be no longer @@ -2994,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone, */ if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); } return sc->nr_scanned >= sc->nr_to_reclaim; @@ -3031,12 +3032,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .order = order, .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, - .may_writepage = !laptop_mode, - .order = order, - .target_mem_cgroup = NULL, }; count_vm_event(PAGEOUTRUN); @@ -3087,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * If balanced, clear the dirty and congested * flags */ - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); } } @@ -3417,14 +3417,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3604,13 +3603,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, 
SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, @@ -3716,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (node_state(node_id, N_CPU) && node_id != numa_node_id()) return ZONE_RECLAIM_NOSCAN; - if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) + if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) return ZONE_RECLAIM_NOSCAN; ret = __zone_reclaim(zone, gfp_mask, order); - zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); @@ -3799,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) } } #endif /* CONFIG_SHMEM */ - -static void warn_scan_unevictable_pages(void) -{ - printk_once(KERN_WARNING - "%s: The scan_unevictable_pages sysctl/node-interface has been " - "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n", - current->comm); -} - -/* - * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of - * all nodes' unevictable lists for evictable pages - */ -unsigned long scan_unevictable_pages; - -int scan_unevictable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - warn_scan_unevictable_pages(); - proc_doulongvec_minmax(table, write, buffer, length, ppos); - scan_unevictable_pages = 0; - return 0; -} - -#ifdef CONFIG_NUMA -/* - * per node 'scan_unevictable_pages' attribute. On demand re-scan of - * a specified node's per zone unevictable lists for evictable pages. - */ - -static ssize_t read_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - warn_scan_unevictable_pages(); - return sprintf(buf, "0\n"); /* always zero; should fit... 
*/ -} - -static ssize_t write_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - warn_scan_unevictable_pages(); - return 1; -} - - -static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, - read_scan_unevictable_node, - write_scan_unevictable_node); - -int scan_unevictable_register_node(struct node *node) -{ - return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); -} - -void scan_unevictable_unregister_node(struct node *node) -{ - device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); -} -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index b37bd49..1b12d39 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -7,6 +7,7 @@ * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., * Christoph Lameter <christoph@lameter.com> + * Copyright (C) 2008-2014 Christoph Lameter */ #include <linux/fs.h> #include <linux/mm.h> @@ -14,6 +15,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/cpumask.h> #include <linux/vmstat.h> #include <linux/sched.h> #include <linux/math64.h> @@ -200,7 +202,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } @@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) EXPORT_SYMBOL(dec_zone_page_state); #endif -static inline void fold_diff(int *diff) + +/* + * Fold a differential into the global counters. + * Returns the number of counters updated. + */ +static int fold_diff(int *diff) { int i; + int changes = 0; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (diff[i]) + if (diff[i]) { atomic_long_add(diff[i], &vm_stat[i]); + changes++; + } + return changes; } /* @@ -441,12 +452,15 @@ static inline void fold_diff(int *diff) * statistics in the remote zone struct as well as the global cachelines * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. + * + * The function returns the number of global counters updated. 
*/ -static void refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(void) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int changes = 0; for_each_populated_zone(zone) { struct per_cpu_pageset __percpu *p = zone->pageset; @@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void) continue; } - if (__this_cpu_dec_return(p->expire)) continue; - if (__this_cpu_read(p->pcp.count)) + if (__this_cpu_read(p->pcp.count)) { drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } #endif } - fold_diff(global_diff); + changes += fold_diff(global_diff); + return changes; } /* @@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* Zoned VM counters */ + /* enum zone_stat_item countes */ "nr_free_pages", "nr_alloc_batch", "nr_inactive_anon", @@ -763,6 +779,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -777,10 +794,13 @@ const char * const vmstat_text[] = { "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", + + /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS + /* enum vm_event_item counters */ "pgpgin", "pgpgout", "pswpin", @@ -859,6 +879,13 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_MEMORY_BALLOON + "balloon_inflate", + "balloon_deflate", +#ifdef CONFIG_BALLOON_COMPACTION + "balloon_migrate", +#endif +#endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", @@ -1067,7 +1094,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1077,10 +1104,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); @@ -1228,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; +static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(); - schedule_delayed_work(this_cpu_ptr(&vmstat_work), + if (refresh_cpu_vm_stats()) + /* + * Counters were updated so we expect more updates + * to occur in the future. Keep on running the + * update worker thread. + */ + schedule_delayed_work(this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + else { + /* + * We did not update any counters so the app may be in + * a mode where it does not cause counter updates. + * We may be uselessly running vmstat_update. + * Defer the checking for differentials to the + * shepherd thread on a different processor. 
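The shepherd's cheap "anything pending on this cpu?" probe, shown in full in need_update() further below, reduces to a single byte scan; this works because each vm_stat_diff entry is one byte wide (enforced by a BUILD_BUG_ON):

        if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
                return true;    /* some per-cpu differential is nonzero */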
+ */ + int r; + /* + * Shepherd work thread does not race since it never + * changes the bit if its zero but the cpu + * online / off line code may race if + * worker threads are still allowed during + * shutdown / startup. + */ + r = cpumask_test_and_set_cpu(smp_processor_id(), + cpu_stat_off); + VM_BUG_ON(r); + } +} + +/* + * Check if the diffs for a certain cpu indicate that + * an update is needed. + */ +static bool need_update(int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); + + BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); + /* + * The fast way of checking if there are any vmstat diffs. + * This works because the diffs are byte sized items. + */ + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + return true; + + } + return false; +} + + +/* + * Shepherd worker thread that checks the + * differentials of processors that have their worker + * threads for vm statistics updates disabled because of + * inactivity. + */ +static void vmstat_shepherd(struct work_struct *w); + +static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); + +static void vmstat_shepherd(struct work_struct *w) +{ + int cpu; + + get_online_cpus(); + /* Check processors whose vmstat worker threads have been disabled */ + for_each_cpu(cpu, cpu_stat_off) + if (need_update(cpu) && + cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + + schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), + __round_jiffies_relative(sysctl_stat_interval, cpu)); + + put_online_cpus(); + + schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); + } -static void start_cpu_timer(int cpu) +static void __init start_shepherd_timer(void) { - struct delayed_work *work = &per_cpu(vmstat_work, cpu); + int cpu; + + for_each_possible_cpu(cpu) + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), + vmstat_update); - INIT_DEFERRABLE_WORK(work, vmstat_update); - schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); + if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) + BUG(); + cpumask_copy(cpu_stat_off, cpu_online_mask); + + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); } static void vmstat_cpu_dead(int node) @@ -1272,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); - start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - per_cpu(vmstat_work, cpu).work.func = NULL; + cpumask_clear_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1302,15 +1417,10 @@ static struct notifier_block vmstat_notifier = static int __init setup_vmstat(void) { #ifdef CONFIG_SMP - int cpu; - cpu_notifier_register_begin(); __register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) { - start_cpu_timer(cpu); - node_set_state(cpu_to_node(cpu), N_CPU); - } + start_shepherd_timer(); cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zbud.h> +#include <linux/zpool.h> /***************** * Structures @@ -59,15 +60,17 @@ * NCHUNKS_ORDER determines the internal allocation granularity, effectively * adjusting internal 
fragmentation. It also determines the number of * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64, and there - * will be 64 freelists per pool. + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by zbud header, NCHUNKS will be calculated to + * 63 which shows the max number of free chunks in zbud page, also there will be + * 63 freelists per pool. */ #define NCHUNKS_ORDER 6 #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) #define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) #define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) /** * struct zbud_pool - stores metadata for each zbud pool @@ -113,6 +116,91 @@ struct zbud_header { }; /***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + return zpool_evict(pool, handle); +} + +static struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zbud_create_pool(gfp, &zbud_zpool_ops); +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zbud"); +#endif /* CONFIG_ZPOOL */ + +/***************** * Helpers *****************/ /* Just to make the code easier to read */ @@ -122,7 +210,7 @@ enum buddy { }; /* Converts an allocation size in bytes to size in zbud chunks */ -static int size_to_chunks(int size) +static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; } @@ -182,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr) { /* * Rather than branch for different situations, just use the fact that - * free buddies have a length of zero to simplify everything. -1 at the - * end for the zbud header. + * free buddies have a length of zero to simplify everything. 
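The NCHUNKS redefinition above is clearest with concrete numbers, assuming 4 KiB pages:

        /* CHUNK_SHIFT       = PAGE_SHIFT - NCHUNKS_ORDER = 12 - 6 = 6
         * CHUNK_SIZE        = 1 << 6 = 64 bytes
         * ZHDR_SIZE_ALIGNED = 64 bytes (the header consumes one chunk)
         * NCHUNKS           = (4096 - 64) >> 6 = 63 usable chunks,
         * which is why num_free_chunks() below can drop its old "- 1". */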
*/ - return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; } /***************** @@ -247,7 +334,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -511,11 +598,20 @@ static int __init init_zbud(void) /* Make sure the zbud header will fit in one chunk */ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + return 0; } static void __exit exit_zbud(void) { +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + pr_info("unloaded\n"); } diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 0000000..739cdf0 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,364 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/zpool.h> + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. 
+ */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. 
+ * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operatons on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. 
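Putting the API together, a minimal consumer sketch (hypothetical; src and len are placeholders and error handling is abbreviated):

        struct zpool *pool = zpool_create_pool("zbud", GFP_KERNEL, NULL);
        unsigned long handle;
        void *dst;

        if (!pool)
                return -ENOMEM;
        if (!zpool_malloc(pool, len, GFP_KERNEL, &handle)) {
                dst = zpool_map_handle(pool, handle, ZPOOL_MM_WO);
                memcpy(dst, src, len);
                zpool_unmap_handle(pool, handle);
                zpool_free(pool, handle);
        }
        zpool_destroy_pool(pool);

Created with a NULL ops struct as here, the pool is not shrinkable, matching the zpool_create_pool() documentation above.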
The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fe78189..839a48c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -92,6 +92,7 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/zsmalloc.h> +#include <linux/zpool.h> /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -174,7 +175,7 @@ enum fullness_group { * n <= N / f, where * n = number of allocated objects * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac + * f = fullness_threshold_frac * * Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f @@ -198,9 +199,6 @@ struct size_class { spinlock_t lock; - /* stats */ - u64 pages_allocated; - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; }; @@ -219,6 +217,7 @@ struct zs_pool { struct size_class size_class[ZS_SIZE_CLASSES]; gfp_t flags; /* allocation flags used when growing pool */ + atomic_long_t pages_allocated; }; /* @@ -240,6 +239,82 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 
0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_pages(pool) << PAGE_SHIFT; +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zsmalloc"); +#endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -553,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) while (page) { struct page *next_page; struct link_free *link; - unsigned int i, objs_on_page; + unsigned int i = 1; /* * page->index stores offset of first object starting @@ -566,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class) link = (struct link_free *)kmap_atomic(page) + off / sizeof(*link); - objs_on_page = (PAGE_SIZE - off) / class->size; - for (i = 1; i <= objs_on_page; i++) { - off += class->size; - if (off < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i); - link += class->size / sizeof(*link); - } + while ((off += class->size) < PAGE_SIZE) { + link->next = obj_location_to_handle(page, i++); + link += class->size / sizeof(*link); } /* @@ -585,7 +656,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) link->next = obj_location_to_handle(next_page, 0); kunmap_atomic(link); page = next_page; - off = (off + class->size) % PAGE_SIZE; + off %= PAGE_SIZE; } } @@ -690,7 +761,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -814,6 +885,10 @@ static void zs_exit(void) { int cpu; +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -840,6 +915,10 @@ static int zs_init(void) cpu_notifier_register_done(); +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; fail: zs_exit(); @@ -943,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) return 0; set_zspage_mapping(first_page, class->index, ZS_EMPTY); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); spin_lock(&class->lock); - class->pages_allocated += class->pages_per_zspage; } obj = (unsigned long)first_page->freelist; @@ -997,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page->inuse--; fullness = fix_fullness_group(pool, 
first_page); - - if (fullness == ZS_EMPTY) - class->pages_allocated -= class->pages_per_zspage; - spin_unlock(&class->lock); - if (fullness == ZS_EMPTY) + if (fullness == ZS_EMPTY) { + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); free_zspage(first_page); + } } EXPORT_SYMBOL_GPL(zs_free); @@ -1098,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); -u64 zs_get_total_size_bytes(struct zs_pool *pool) +unsigned long zs_get_total_pages(struct zs_pool *pool) { - int i; - u64 npages = 0; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) - npages += pool->size_class[i].pages_allocated; - - return npages << PAGE_SHIFT; + return atomic_long_read(&pool->pages_allocated); } -EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); +EXPORT_SYMBOL_GPL(zs_get_total_pages); module_init(zs_init); module_exit(zs_exit); @@ -34,7 +34,7 @@ #include <linux/swap.h> #include <linux/crypto.h> #include <linux/mempool.h> -#include <linux/zbud.h> +#include <linux/zpool.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -45,8 +45,8 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -static u64 zswap_pool_pages; +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); @@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); -/* zbud_pool is shared by all of zswap backend */ -static struct zbud_pool *zswap_pool; +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); + +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; /********************************* * compression functions @@ -168,7 +173,7 @@ static void zswap_comp_exit(void) * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zbud allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during * decompression */ @@ -207,7 +212,7 @@ static int zswap_entry_cache_create(void) return zswap_entry_cache == NULL; } -static void zswap_entry_cache_destory(void) +static void __init zswap_entry_cache_destroy(void) { kmem_cache_destroy(zswap_entry_cache); } @@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zbud allocation, + * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. 
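Since the pool size is now tracked in bytes rather than pages, the limit check in zswap_is_full() below converts back to pages; with illustrative numbers:

        /* Assuming 1 GiB of RAM (totalram_pages == 262144 with 4 KiB
         * pages) and the default max_pool_percent of 20, zswap is full
         * once DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE) exceeds
         * 262144 * 20 / 100 = 52428 pages of compressed storage. */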
@@ -409,7 +414,7 @@ cleanup:
 static bool zswap_is_full(void)
 {
 	return totalram_pages * zswap_max_pool_percent / 100 <
-		zswap_pool_pages;
+		DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
 }
 
 /*********************************
@@ -502,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 	} while (err != -ENOMEM);
 
 	if (new_page)
@@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
-static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 {
 	struct zswap_header *zhdr;
 	swp_entry_t swpentry;
@@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
 	};
 
 	/* extract swpentry from data */
-	zhdr = zbud_map(pool, handle);
+	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
 	swpentry = zhdr->swpentry; /* here */
-	zbud_unmap(pool, handle);
+	zpool_unmap_handle(pool, handle);
 	tree = zswap_trees[swp_type(swpentry)];
 	offset = swp_offset(swpentry);
 
@@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 		/* decompress */
 		dlen = PAGE_SIZE;
-		src = (u8 *)zbud_map(zswap_pool, entry->handle) +
-			sizeof(struct zswap_header);
+		src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+				ZPOOL_MM_RO) + sizeof(struct zswap_header);
 		dst = kmap_atomic(page);
 		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
 				entry->length, dst, &dlen);
 		kunmap_atomic(dst);
-		zbud_unmap(zswap_pool, entry->handle);
+		zpool_unmap_handle(zswap_pool, entry->handle);
 		BUG_ON(ret);
 		BUG_ON(dlen != PAGE_SIZE);
 
@@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	/* reclaim space if needed */
 	if (zswap_is_full()) {
 		zswap_pool_limit_hit++;
-		if (zbud_reclaim_page(zswap_pool, 8)) {
+		if (zpool_shrink(zswap_pool, 1, NULL)) {
 			zswap_reject_reclaim_fail++;
 			ret = -ENOMEM;
 			goto reject;
@@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 
 	/* store */
 	len = dlen + sizeof(struct zswap_header);
-	ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
+	ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
 		&handle);
 	if (ret == -ENOSPC) {
 		zswap_reject_compress_poor++;
@@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		zswap_reject_alloc_fail++;
 		goto freepage;
 	}
-	zhdr = zbud_map(zswap_pool, handle);
+	zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
 	zhdr->swpentry = swp_entry(type, offset);
 	buf = (u8 *)(zhdr + 1);
 	memcpy(buf, dst, dlen);
-	zbud_unmap(zswap_pool, handle);
+	zpool_unmap_handle(zswap_pool, handle);
 	put_cpu_var(zswap_dstmem);
 
 	/* populate entry */
@@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 
 	/* update stats */
 	atomic_inc(&zswap_stored_pages);
-	zswap_pool_pages = zbud_get_pool_size(zswap_pool);
+	zswap_pool_total_size = zpool_get_total_size(zswap_pool);
 
 	return 0;
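The zswap_is_full() hunk above is the subtle part of the conversion: the pool size is now reported in bytes rather than pages, so it must be rounded up to whole pages before comparing against the percentage cap. A self-contained userspace stand-in of that arithmetic (names and page size are illustrative, not kernel code):

#include <stdio.h>

#define EX_PAGE_SIZE 4096UL

static int example_is_full(unsigned long totalram_pages,
			   unsigned int max_pool_percent,
			   unsigned long long pool_total_size)
{
	/* same rounding as DIV_ROUND_UP(pool_total_size, PAGE_SIZE) */
	unsigned long long pool_pages =
		(pool_total_size + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE;

	return totalram_pages * max_pool_percent / 100 < pool_pages;
}

int main(void)
{
	/* 1 GiB of RAM with a 20% cap allows 52428 pool pages;
	 * one page over the cap reports full (prints 1) */
	printf("%d\n", example_is_full(262144, 20, 52429ULL * EX_PAGE_SIZE));
	return 0;
}

Rounding up is deliberate: truncating would let the pool creep one partial page past the configured limit before reclaim kicks in.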
@@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 	/* decompress */
 	dlen = PAGE_SIZE;
-	src = (u8 *)zbud_map(zswap_pool, entry->handle) +
-			sizeof(struct zswap_header);
+	src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+			ZPOOL_MM_RO) + sizeof(struct zswap_header);
 	dst = kmap_atomic(page);
 	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
 		dst, &dlen);
 	kunmap_atomic(dst);
-	zbud_unmap(zswap_pool, entry->handle);
+	zpool_unmap_handle(zswap_pool, entry->handle);
 	BUG_ON(ret);
 
 	spin_lock(&tree->lock);
@@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
 	zswap_trees[type] = NULL;
 }
 
-static struct zbud_ops zswap_zbud_ops = {
+static struct zpool_ops zswap_zpool_ops = {
 	.evict = zswap_writeback_entry
 };
 
@@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void)
 			zswap_debugfs_root, &zswap_written_back_pages);
 	debugfs_create_u64("duplicate_entry", S_IRUGO,
 			zswap_debugfs_root, &zswap_duplicate_entry);
-	debugfs_create_u64("pool_pages", S_IRUGO,
-			zswap_debugfs_root, &zswap_pool_pages);
+	debugfs_create_u64("pool_total_size", S_IRUGO,
+			zswap_debugfs_root, &zswap_pool_total_size);
 	debugfs_create_atomic_t("stored_pages", S_IRUGO,
 			zswap_debugfs_root, &zswap_stored_pages);
 
@@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { }
 **********************************/
 static int __init init_zswap(void)
 {
+	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+
 	if (!zswap_enabled)
 		return 0;
 
 	pr_info("loading zswap\n");
 
-	zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
+	zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
+	if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+		pr_info("%s zpool not available\n", zswap_zpool_type);
+		zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+		zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
+					&zswap_zpool_ops);
+	}
 	if (!zswap_pool) {
-		pr_err("zbud pool creation failed\n");
+		pr_err("%s zpool not available\n", zswap_zpool_type);
+		pr_err("zpool creation failed\n");
 		goto error;
 	}
+	pr_info("using %s pool\n", zswap_zpool_type);
 
 	if (zswap_entry_cache_create()) {
 		pr_err("entry cache creation failed\n");
@@ -926,9 +941,9 @@ static int __init init_zswap(void)
 pcpufail:
 	zswap_comp_exit();
 compfail:
-	zswap_entry_cache_destory();
+	zswap_entry_cache_destroy();
 cachefail:
-	zbud_destroy_pool(zswap_pool);
+	zpool_destroy_pool(zswap_pool);
 error:
 	return -ENOMEM;
 }
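The init_zswap() hunk above introduces a two-step selection: try the zpool type requested via the module parameter, and if it is not available fall back to the compiled-in default before giving up. A hedged sketch of that control flow in isolation; create_backend() is a hypothetical stand-in for zpool_create_pool(), and EXAMPLE_DEFAULT mirrors ZSWAP_ZPOOL_DEFAULT:

#include <string.h>

#define EXAMPLE_DEFAULT "zbud"

extern void *create_backend(const char *type);	/* hypothetical */

static void *create_pool_with_fallback(const char **type)
{
	void *pool = create_backend(*type);

	/* requested backend unavailable and it was not already the
	 * default: retry once with the default backend */
	if (!pool && strcmp(*type, EXAMPLE_DEFAULT)) {
		*type = EXAMPLE_DEFAULT;
		pool = create_backend(*type);
	}
	return pool;	/* NULL here means even the default failed */
}

Updating *type on fallback is what lets the later "using %s pool" message report the backend actually in use rather than the one the user asked for.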